MastodonHelp/web/clitools/crawler.php
2022-12-12 08:17:01 +01:00

1286 lines
53 KiB
PHP
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const N="\n";
require(__DIR__.'/../site/mustard/include/getfc.php');
require(__DIR__.'/lib/vendor/autoload.php');
use LanguageDetection\Language;
// true when PHP_OS starts with "WIN" (case-insensitively); the table helpers below lower-case table names in that case
$iswin=(strncasecmp(PHP_OS,'WIN',3)===0);
/**
 * Emits a timestamped log line.
 *
 * @param int    $lev Level index: 0=Debug, 1=Info, 2=Warning, 3=Error. Levels below 2 are
 *                    echoed to standard output, the others are written to STDERR.
 * @param string $msg Message text; it is emitted as is, so the caller provides the trailing newline.
 */
function eecho($lev,$msg) {
    static $labels=['Debug', 'Info', 'Warning', 'Error'];
    // microtime(false) yields "<fraction> <seconds>", e.g. "0.12345600 1670829421"
    list($frac,$secs)=explode(' ',microtime(false));
    $line=date('Y-m-d H:i:s',$secs).'.'.substr($frac,2).' '.$labels[$lev].': '.$msg;
    if ($lev>=2)
        fwrite(STDERR,$line);
    else
        echo($line);
}
/**
 * Releases what the script holds (database connection, JSON output file, lock file),
 * logs a final message and terminates the script.
 *
 * @param string $msg  Final message; logged as Error when $code is not 0, as Info otherwise
 * @param int    $code Exit status handed to exit()
 */
function mexit($msg,$code) {
    global $link, $jsonf, $lockfp;
    // the script uses "$x=@open(...) or mexit(...)": on failure $x is set to false *before*
    // mexit() runs, so isset() alone would let false reach mysqli_close()/fclose()
    if (isset($link) && $link instanceof mysqli)
        mysqli_close($link);
    if (isset($jsonf) && is_resource($jsonf))
        fclose($jsonf);
    if (isset($lockfp) && is_file($lockfp))
        unlink($lockfp);
    if ($code!=0)
        eecho(3,$msg);
    else
        eecho(1,$msg);
    exit($code);
}
declare(ticks=1);
// signal handling is set up only when the pcntl functions are available
if (function_exists('pcntl_signal')) {
    /**
     * Shuts the script down through mexit() with exit status 0.
     *
     * @param int $signal Number of the received signal
     */
    function signalHandler($signal) {
        echo(N);
        mexit('received signal «'.$signal.'», shutting down.'.N,0);
    }
    // SIGTERM: termination ('kill' was called); SIGHUP: terminal log-out; SIGINT: interrupted (Ctrl-C is pressed)
    foreach ([SIGTERM, SIGHUP, SIGINT] as $signo)
        pcntl_signal($signo,'signalHandler');
    unset($signo);
}
// default settings; the command line parsing below overrides some of them
$opts=array(
    // seconds to wait on every connection attempt
    'timeout'     => 10,
    // an instance that has not been responding for more than this many seconds (60 days) is declared dead
    'deadline'    => 60*86400,
    // an instance that has been new for more than this many seconds (30 days) is no longer new
    'oldline'     => 30*86400,
    // number of toots to check with the automatic language detection function
    'ldtoots'     => 40,
    'setnew'      => true,
    'dryrun'      => false,
    'jsonfp'      => __DIR__.'/instances.json',
    'jsonwrite'   => false,
    'peersfp'     => null,
    'dontrestore' => false,
    'ignorelock'  => false,
    'fetchusers'  => false,
    // extra SQL appended to the main query for instances records
    'moreclauses' => ''
);
// help text printed by «-h»/«--help»; it is built after $opts so that it can quote the default timeout.
// The concatenation around $opts['timeout'] must be closed again with '» for the string to continue.
$help='crawler.php
DESCRIPTION
This script updates mastostarts database with the data it manages to
retrieve from instances already present in the database plus (optionally)
those listed in a specifiable file (typically the output file from a
peerscrawl.php run).
SYNOPSIS
crawler.php [options]
OPTIONS
-p, --peersfp <file>
Sets a file containing a list of instances to consider in addition to those
which are already present in the database.
Note that this option is ignored if the script will recover a previous
unfinished session.
-f, --fetchusers
*Currently experimental*: if this option is set, the script will try and
fetch users profiles infos from each considered instances user directory
and store them in the database.
-t, --timeout <seconds>
Sets the timeout in seconds for every connection attempt.
DEFAULT: «'.$opts['timeout'].'»
-N, --dontsetnew
If this option is set, the script wont mark new instances as new. This can
be useful for a first run.
-I, --ignorelock
Normally, if its lockfile exists, the script will exit with an error.
If this option is set, the lockfile existence will be ignored.
Warning: check that the script is actually not running yet before using
this option.
-R, --dontrestore
If this option is set and «instances.job» and «currinst.job» files from
a previous unfinished session are present, the script will ignore them
and start a new session.
-d, --dryrun
If this option is set, the script wont write anything in the database.
-j, --jsonwrite
If this option is set, the script will write an «instances.json» file
containing all the data it could retrieve from every considered instance.
-m, --moreclauses <more SQL clauses>
If this option is set, whatever one writes as argument to the option will
be added to the main query for instances records, which is
«SELECT URI FROM Instances WHERE Dead=0», so one can limit the crawl more.
-h, --help
If this option is set, the script will show this help text and exit.
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Command line parsing. Arguments that do not start with «-» are skipped.
// Switches without an argument simply set one entry of $opts to a fixed value.
$switches=[
    '-f'=>['fetchusers',true],  '--fetchusers'=>['fetchusers',true],
    '-N'=>['setnew',false],     '--dontsetnew'=>['setnew',false],
    '-R'=>['dontrestore',true], '--dontrestore'=>['dontrestore',true],
    '-I'=>['ignorelock',true],  '--ignorelock'=>['ignorelock',true],
    '-d'=>['dryrun',true],      '--dryrun'=>['dryrun',true],
    '-j'=>['jsonwrite',true],   '--jsonwrite'=>['jsonwrite',true]
];
for ($i=1; $i<$argc; $i++) {
    $arg=$argv[$i];
    if (substr($arg,0,1)!='-')
        continue;
    if (isset($switches[$arg])) {
        $opts[$switches[$arg][0]]=$switches[$arg][1];
        continue;
    }
    // the remaining options either take one argument or end the script
    $hasval=($i+1<$argc);
    switch ($arg) {
        case '-p':
        case '--peersfp':
            if (!$hasval || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
                mexit('option «'.$arg.'» requires an existing and readable file as an argument (use «-h» to read help).'.N,1);
            $opts['peersfp']=$argv[++$i];
            break;
        case '-t':
        case '--timeout':
            if (!$hasval || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
                mexit('option «'.$arg.'» requires a numeric argument (use «-h» to read help).'.N,1);
            $opts['timeout']=$argv[++$i]+0;
            break;
        case '-m':
        case '--moreclauses':
            if (!$hasval)
                mexit('option «'.$arg.'» requires some SQL clause as argument (use «-h» to read help).'.N,1);
            $opts['moreclauses']=$argv[++$i];
            break;
        case '-h':
        case '--help':
            echo($help);
            exit(0);
        default:
            mexit('option «'.$arg.'» is unknown (use «-h» to read help).'.N,1);
    }
}
use function mysqli_real_escape_string as myesc;
/**
 * Runs a query and terminates the script through mexit() (exit status 3) if it fails.
 *
 * A failure is detected both when mysqli throws and when mysqli_query() returns false,
 * which is what happens when mysqli is not set to report errors as exceptions.
 *
 * @param mysqli $link  Database connection
 * @param string $query SQL to execute
 * @param int    $line  Line number of the caller, quoted in the error message
 * @return mysqli_result|bool Whatever mysqli_query() returned on success
 */
function myq(&$link,$query,$line) {
    try {
        $res=mysqli_query($link,$query);
    }
    catch (Exception $error) {
        mexit('query «'.$query.'» (line '.$line.') failed: '.$error->getMessage().N,3);
    }
    if ($res===false)
        mexit('query «'.$query.'» (line '.$line.') failed: '.mysqli_error($link).N,3);
    return($res);
}
// Refuse to start when the lock file exists, unless «-I» was given. Plain exit() is used here
// instead of mexit() because mexit() would delete the lock file.
$lockfp=__DIR__.'/crawler.lock';
if (file_exists($lockfp) && !$opts['ignorelock']) {
    eecho(3,'lock file «'.$lockfp.'» exists (if you are sure crawler.php is not already running you can use option «-I» to force execution).'.N);
    exit(1);
}
touch($lockfp);

// configuration and database connection
$inifp=__DIR__.'/../conf/mustard.ini';
$iniarr=@parse_ini_file($inifp)
    or mexit('could not open config file «'.$inifp.'»'.N,1);
try {
    $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']);
}
catch (mysqli_sql_exception $error) {
    // when mysqli reports errors as exceptions a failed connection throws instead of returning false
    $link=false;
}
if (!$link) {
    $connerr=mysqli_connect_error();
    // there is no connection to close: keep mexit() from seeing $link at all
    unset($link);
    mexit('could not connect to MySQL server: '.$connerr.N,1);
}
mysqli_set_charset($link,'utf8mb4')
    or mexit('could not set «utf8mb4» charset fro MySQL: '.mysqli_error($link).N,1);
require(__DIR__.'/../site/mustard/include/tables.php');
$tables=tables($link);

// recovery of a previous, unfinished session: «instances.job» holds the list of hosts, one per line,
// «currinst.job» holds "host<TAB>index<TAB>ok counter<TAB>good counter" of the host that was being worked on
$recover=false;
$instsjfp=__DIR__.'/instances.job';
$currinstjfp=__DIR__.'/currinst.job';
if (!$opts['dontrestore'] && file_exists($currinstjfp) && file_exists($instsjfp)) {
    eecho(0,'looks like previous session was interrupted, trying to recover it...'.N);
    $insts=@file($instsjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES)
        or mexit('could not open file «'.$instsjfp.'» for reading.'.N,1);
    $buf=@file($currinstjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES)
        or mexit('could not open file «'.$currinstjfp.'» for reading.'.N,1);
    $buf=explode("\t",$buf[0]);
    // a truncated or hand-edited record would otherwise restart the loop with undefined counters
    if (count($buf)<4 || !ctype_digit($buf[1]) || !ctype_digit($buf[2]) || !ctype_digit($buf[3]))
        mexit('file «'.$currinstjfp.'» has an unexpected format (use «-R» to start a new session).'.N,1);
    $currinst=array('dom'=>$buf[0], 'i'=>(int)$buf[1], 'qok'=>(int)$buf[2], 'qgood'=>(int)$buf[3]);
    $recover=true;
    eecho(1,'recovered previous session.'.N);
}
/**
 * Makes a string fit the size registered in $tables for a column, sending a notification when it had to be cut.
 *
 * @param string|null $str String to check; null is returned unchanged
 * @param string      $tab Table name (lower-cased when $iswin is true)
 * @param string      $col Column name
 * @param string      $ctx Prefix for the notification text
 * @return string|null The string itself, or its first size-1 characters followed by «…»
 */
function truncs($str,$tab,$col,$ctx) {
    global $tables, $iswin;
    if ($str===null)
        return(null);
    $tname=$iswin ? strtolower($tab) : $tab;
    $max=$tables[$tname][$col];
    if (mb_strlen($str,'UTF-8')<=$max)
        return($str);
    $cut=mb_substr($str,0,$max-1,'UTF-8').'…';
    notify($ctx.': had to truncate string to '.$max.' chars to be able to insert it into «'.$col.'» column in «'.$tname.'» table.',3);
    return($cut);
}
/**
 * Clamps a number to the «min»/«max» bounds registered in $tables for a column,
 * sending a notification whenever the value had to be changed.
 *
 * @param mixed  $num Value to check
 * @param string $tab Table name (lower-cased when $iswin is true)
 * @param string $col Column name
 * @param string $ctx Prefix for the notification text
 * @return mixed $num itself when it is within bounds, the violated bound otherwise, 0 when $num is not numeric
 */
function truncn($num,$tab,$col,$ctx) {
    global $tables, $iswin;
    if ($iswin)
        $tab=strtolower($tab);
    if (!is_numeric($num)) {
        notify($ctx.': function «truncn»: expecting a number, got something else; returning «0».',3);
        return(0);
    }
    $max=$tables[$tab][$col]['max'];
    if ($num>$max) {
        notify($ctx.': had to ceil «'.$num.'» to «'.$max.'», ie the maximum value it can have in column «'.$col.'» of table «'.$tab.'».',3);
        return($max);
    }
    $min=$tables[$tab][$col]['min'];
    if ($num<$min) {
        notify($ctx.': had to floor «'.$num.'» to «'.$min.'», ie the minimum value it can have in column «'.$col.'» of table «'.$tab.'»).',3);
        return($min);
    }
    return($num);
}
/*$contextopts=array(
'http'=>array(
'timeout'=>$opts['timeout']
),
'socket'=>array(
'tcp_nodelay'=>true
)
);
$context=stream_context_create($contextopts);*/
/**
 * Converts a date such as «2018-04-07T15:05:26.801Z» into a Unix timestamp, read as GMT.
 *
 * @param string $pgdate Date and time, separated by a blank or «T», with optional fraction of second and optional trailing «Z»
 * @return int|float The timestamp (a float when a fraction of second is present); when the format
 *                   is not recognized a notification is sent and the current time is returned
 */
function pgdatetomy($pgdate) {
    $parts=array();
    if (preg_match('/^(\d+)-(\d+)-(\d+)[ T]{1}(\d+):(\d+):(\d+)(\.\d+)?Z?$/',$pgdate,$parts)!==1) {
        notify('Function «pgdatetomy»: «'.$pgdate.'» has not a recognized date format; returning current date.',3);
        return(time());
    }
    list(,$year,$month,$day,$hour,$minute,$second)=$parts;
    $stamp=gmmktime($hour,$minute,$second,$month,$day,$year);
    // index 7 exists only when the fraction of second matched
    if (isset($parts[7]))
        $stamp+=floatval('0'.$parts[7]);
    return($stamp);
}
// New session: build the list of instances to check (alive instances from the database, plus the
// peers file if one was given), then save it to «instances.job» so that the session can be recovered.
if (!$recover) {
    $insts=array();
    // $known and $deadinsts are used as sets (host => true) to get constant-time membership tests
    $known=array();
    // the blank before the extra clauses keeps them from being glued to «Dead=0»
    $res=myq($link,'SELECT URI FROM Instances WHERE Dead=0 '.$opts['moreclauses'],__LINE__);
    while ($row=mysqli_fetch_assoc($res)) {
        if (!isset($known[$row['URI']])) {
            $known[$row['URI']]=true;
            $insts[]=$row['URI'];
        }
    }
    eecho(1,'loaded known, alive instances from the database into the list of instances to be checked.'.N);
    $res=myq($link,'SELECT URI FROM Instances WHERE Dead=1',__LINE__);
    $deadinsts=array();
    while ($row=mysqli_fetch_assoc($res))
        $deadinsts[$row['URI']]=true;
    eecho(1,'loaded dead instances into the corresponding list.'.N);
    if (!is_null($opts['peersfp'])) {
        eecho(0,'loading other instances to be checked from «'.$opts['peersfp'].'».'.N);
        $peers=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
        if ($peers===false)
            mexit('could not open «'.$opts['peersfp'].'» for reading.'.N,1);
        foreach ($peers as $pdom) {
            // already in the list: nothing to do
            if (isset($known[$pdom]))
                continue;
            if (isset($deadinsts[$pdom])) {
                eecho(1,'ignoring instance «'.$pdom.'» because it is dead.'.N);
                continue;
            }
            if (willtrunc($pdom,'Instances','URI')) {
                eecho(2,'ignoring instance «'.$pdom.'» because its hostname is too long for column «URI» of table «Instances».'.N);
                continue;
            }
            $known[$pdom]=true;
            $insts[]=$pdom;
        }
    }
    unset($deadinsts,$known);
    sort($insts);
    eecho(1,count($insts).' instances to be checked.'.N);
    $instsf=@fopen($instsjfp,'w')
        or mexit('could not open «'.$instsjfp.'» for writing.'.N,1);
    foreach ($insts as $host)
        fwrite($instsf,$host.N);
    fclose($instsf);
}
/**
 * Tells whether a string is longer than the size registered in $tables for a column.
 *
 * @param string $str String to measure (length in UTF-8 characters)
 * @param string $tab Table name (lower-cased when $iswin is true)
 * @param string $col Column name
 * @return bool true when the string does not fit
 */
function willtrunc($str,$tab,$col) {
    global $tables, $iswin;
    $tname=$iswin ? strtolower($tab) : $tab;
    return(mb_strlen($str,'UTF-8')>$tables[$tname][$col]);
}
/**
 * Converts a boolean into 1 or 0.
 *
 * @param mixed  $bool Value to convert
 * @param string $pre  Prefix for the notification sent when $bool is not a boolean
 * @return int 1 for true, 0 for false and for anything that is not a boolean
 */
function b2i($bool,$pre) {
    if (!is_bool($bool)) {
        notify($pre.'«'.$bool.'» is not a boolean value, returning «0».',3);
        return(0);
    }
    return($bool ? 1 : 0);
}
/**
 * "Array Key Exists And Value Is Not Null": tells whether $arr is an array holding a non-null value under $key.
 *
 * @param int|string $key Key to look for
 * @param mixed      $arr Value to inspect (taken by reference)
 * @return bool
 */
function akeavinn($key,&$arr) {
    // isset() is true exactly when the key exists and its value is not null
    return(is_array($arr) && isset($arr[$key]));
}
/**
 * Turns an empty or whitespace-only string into null.
 *
 * @param string $str String to check
 * @return string|null null when $str holds nothing but whitespace, $str itself otherwise
 */
function nempty($str) {
    $blank=(preg_match('/^\s*$/',$str)===1);
    return($blank ? null : $str);
}
/**
 * "Sub-array implode": joins the values found under one key in each of the inner arrays.
 *
 * @param string     $glue Separator placed between the values
 * @param int|string $key  Key to read in every inner array
 * @param array      $arr  Array of arrays (taken by reference, not modified)
 * @return string The joined values; an empty string for an empty $arr
 */
function subarimp($glue,$key,&$arr) {
    $values=array();
    foreach ($arr as $inner)
        $values[]=$inner[$key];
    return(implode($glue,$values));
}
/**
 * Logs a notification and, unless this is a dry run, stores it in the «Notifications» table.
 *
 * @param string $msg Notification text; tags are stripped for the log line, while the stored
 *                    text is cut to the size registered in $tables for the «Notification» column
 * @param int    $sev "Severity" (it should rather be called "Importance"), to be thought of as
 *                    the $lev parameter of eecho(): 0=debug, 1=info, 2=warning, 3=error
 */
function notify($msg,$sev) {
    global $link, $tables, $iswin, $opts;
    eecho($sev,'*notification*: '.strip_tags($msg).N);
    if ($opts['dryrun'])
        return;
    // the key used to look the table up in $tables is lower-cased when $iswin is true
    $tab=$iswin ? 'notifications' : 'Notifications';
    $text=myesc($link,mb_substr($msg,0,$tables[$tab]['Notification'],'UTF-8'));
    myq($link,'INSERT INTO Notifications (ID, Notification, Severity, Microtime, Seen, Deleted) VALUES (NULL, \''.$text.'\', '.$sev.', \''.microtime(true).'\', 0, 0)',__LINE__);
}
/** <LANGUAGE MANAGEMENT> */
/**
 * Performs an HTTPS GET request against an instance API and decodes the JSON answer.
 *
 * @param string $host Host to be called (e.g.: "mastodon.bida.im")
 * @param string $path API path (e.g.: "/api/v1/timelines/public?local=true")
 * @return mixed What json_decode() returns in associative mode for the fetched content, or null if the fetch fails
 */
function get_api($host, $path) {
    global $opts;
    $resp = @getfc('https://'.$host.$path,$opts['timeout']);
    if ($resp['cont']===false)
        return null;
    // the response headers are handed to ckratelimit() before decoding
    ckratelimit($resp['headers']);
    return json_decode($resp['cont'], true);
}
/**
 * Returns the languages recognized for a toot, each with its probability.
 *
 * When the toot declares its language, that language gets probability 1; otherwise the
 * language is detected from the tag-stripped content. Derived codes are grouped under
 * their first part (e.g.: "zh-CN" under "zh"), keeping the highest probability.
 *
 * @param mixed $toot The toot to be checked, as returned by the API
 * @return array Associative array mapping language codes to probabilities; empty when $toot
 *               is not an array or has neither a language nor a content
 */
function get_toot_languages($toot) {
    // start from an empty map so that unusable input yields an empty result instead of an error
    $langs = array();
    if (is_array($toot)) {
        if (isset($toot['language'])) {
            // the language is explicitly set in the toot, so use that
            $langs[$toot['language']] = 1;
        } elseif (array_key_exists('content',$toot)) {
            // the language is not explicitly set in the toot, so try and recognize it
            $text = strip_tags($toot['content']);
            $ld = new Language;
            $langs = $ld->detect($text)->bestResults()->close();
        }
    }
    // group derived languages into their base language code (e.g.: "zh-CN" into "zh")
    $grouped_langs = array();
    foreach ($langs as $key => $value) {
        $l = explode("-", $key)[0];
        if (array_key_exists($l, $grouped_langs)) {
            $grouped_langs[$l] = max($grouped_langs[$l], $value);
        } else {
            $grouped_langs[$l] = $value;
        }
    }
    return $grouped_langs;
}
/**
 * Averages the per-toot language probabilities over all the toots.
 *
 * @param array $detected_langs List of maps between language and probability, one per toot
 * @return array Map between language and the sum of its probabilities divided by the number of toots
 */
function summary($detected_langs) {
    $totals = array();
    foreach ($detected_langs as $toot_langs) {
        foreach ($toot_langs as $code => $weight) {
            $sofar = array_key_exists($code, $totals) ? $totals[$code] : 0;
            $totals[$code] = $sofar + $weight;
        }
    }
    if (count($totals) == 0)
        return $totals;
    // toots without any language still count in the denominator
    $ntoots = count($detected_langs);
    return array_map(function ($sum) use ($ntoots) {
        return $sum / $ntoots;
    }, $totals);
}
/**
 * Comparison callback for usort(): orders arrays by their first element, highest first.
 *
 * @param array $entry1 First array to be compared
 * @param array $entry2 Second array to be compared
 * @return int 1, 0 or -1 depending on $entry1[0] being less than, equal to or greater than $entry2[0]
 */
function sort_weights($entry1, $entry2) {
    // operands are swapped on purpose: the smaller weight must sort after the bigger one
    return $entry2[0] <=> $entry1[0];
}
/**
 * Picks the probable languages out of a map between language and probability.
 *
 * Languages are visited by decreasing weight; the scan stops at the first one whose weight is
 * lower than two thirds of the weight of the previous one.
 *
 * @param array $summary Map between language and probability
 * @return string[] List of probable languages
 */
function get_languages($summary) {
    $ranked = array();
    foreach ($summary as $code => $weight)
        $ranked[] = array($weight, $code);
    usort($ranked, 'sort_weights');
    $picked = array();
    $prev = 0;
    foreach ($ranked as list($weight, $code)) {
        if ($weight < $prev * 2 / 3)
            break;
        $picked[] = $code;
        $prev = $weight;
    }
    return $picked;
}
/**
 * Returns a list of probable languages for the given instance, judging from its local public timeline.
 *
 * @param string $host Instances hostname (e.g.: "mastodon.bida.im")
 * @return string[] List of probable languages; empty when the timeline could not be fetched
 *                  or holds no usable toot
 */
function get_instance_langs($host) {
    global $opts;
    $data = get_api($host, '/api/v1/timelines/public?local=true&limit='.$opts['ldtoots']);
    // anything that is not an array (failed fetch, undecodable or scalar JSON) cannot be a timeline
    if (!is_array($data))
        return [];
    // keep only the entries that look like toots: an answer that is an object rather than
    // a list (e.g. an error message) has scalar members that must not be treated as toots
    $toots = array_values(array_filter($data, 'is_array'));
    if (count($toots) == 0)
        return [];
    $detected_langs = array_map('get_toot_languages', $toots);
    return get_languages(summary($detected_langs));
}
require(__DIR__.'/../site/mustard/include/mb_ucfirst.php');
/**
 * Determines the languages of an instance and makes sure each of them has a row in the «Languages» table.
 *
 * Languages are detected from the instance timeline when $auto is true, otherwise they are
 * taken from the global $info['languages']. Unknown languages are inserted (unless this is a
 * dry run) with their display names in the locales supported by the table.
 *
 * @param int|null $instid Instance ID, copied into every returned entry
 * @param string   $uri    Instance hostname, used for the automatic detection
 * @param bool     $auto   Whether to detect the languages from the last toots
 * @return array List of arrays with keys «InstID», «LangID» (0 for a language that a dry run
 *               did not insert), «Pos» (1-based position) and «Code»
 */
function langs($instid, $uri, $auto) {
    global $info, $instrow, $link, $opts;
    $retlangs=array();
    $languages=array();
    // even if $auto is true, set it to false (don't do autodetection of languages based on last toots) if api/v1/instance returned a language different from the default "en": assume instead it is right, because it has been explicitly set
    if (isset($info['languages'][0]) && $info['languages'][0]!='en')
        $auto=false;
    if ($auto) {
        $languages = get_instance_langs($uri);
    } elseif (akeavinn('languages',$info) && is_array($info['languages'])) {
        $languages = $info['languages'];
    }
    if (count($languages)==0)
        return($retlangs);
    foreach ($languages as $key=>$val)
        $languages[$key]=str_replace('-','_',$val);
    if ($auto)
        eecho(1,'detected languages: '.implode(', ',$languages).N);
    else
        eecho(1,'declared languages: '.implode(', ',$languages).N);
    // name columns of the «Languages» table and the locale each name is written in;
    // null stands for the language itself (its own name for itself)
    $namecols=array(
        'NameOrig'=>null,
        'NamePT_BR'=>'pt_BR',
        'NameDE'=>'de',
        'NameUK'=>'uk',
        'NameCA'=>'ca',
        'NameEN'=>'en',
        'NameES'=>'es',
        'NameFR'=>'fr',
        'NameGL'=>'gl',
        'NameIT'=>'it'
    );
    $pos=0;
    foreach ($languages as $lang) {
        $res=myq($link,'SELECT * FROM Languages WHERE Code=\''.myesc($link,$lang).'\'',__LINE__);
        if (mysqli_num_rows($res)<1) {
            $ctx='«'.$instrow['URI'].'»';
            // columns and values are built side by side so that they cannot get out of step
            $cols=array('ID', 'Code');
            $vals=array('NULL', '\''.myesc($link,truncs($lang,'Languages','Code',$ctx)).'\'');
            foreach ($namecols as $col=>$locale) {
                $name=mb_ucfirst(locale_get_display_name($lang,is_null($locale) ? $lang : $locale));
                $cols[]=$col;
                $vals[]='\''.myesc($link,truncs($name,'Languages',$col,$ctx)).'\'';
            }
            $q='INSERT INTO Languages ('.implode(', ',$cols).') VALUES ('.implode(', ',$vals).')';
            if (!$opts['dryrun']) {
                myq($link,$q,__LINE__);
                $langid=mysqli_insert_id($link);
            } else {
                $langid=0;
            }
        } else {
            $row=mysqli_fetch_assoc($res);
            $langid=$row['ID'];
        }
        $pos++;
        $retlangs[]=array('InstID'=>$instid,'LangID'=>$langid,'Pos'=>$pos,'Code'=>$lang);
    }
    return($retlangs);
}
/**
 * Returns what var_dump() would print for a value, instead of printing it.
 *
 * @param mixed $var Value to dump
 * @return string The captured output
 */
function varbdump($var) {
    ob_start();
    var_dump($var);
    // fetches the buffer contents and closes the buffer in one step
    return(ob_get_clean());
}
/**
 * "Multi-dimensional array sort by key": sorts an array of arrays in place by the numeric value
 * found under $key in each inner array, keeping the outer keys.
 *
 * Inner arrays with the same value are all kept, in their original relative order.
 *
 * @param array      $arr Array of arrays to sort (modified in place)
 * @param int|string $key Key of the inner arrays holding the value to sort by
 * @param bool       $rev false for ascending order, true for descending order
 */
function mdasortbykey(&$arr,$key,$rev=false) {
    // each row carries: sort value, original position (tie breaker), outer key, inner array
    $rows=array();
    $n=0;
    foreach ($arr as $akey=>$subarr)
        $rows[]=array($subarr[$key],$n++,$akey,$subarr);
    $dir=$rev ? -1 : 1;
    usort($rows,function ($a,$b) use ($dir) {
        if ($a[0]==$b[0])
            return($a[1]<=>$b[1]);
        return($a[0]<$b[0] ? -$dir : $dir);
    });
    $arr=array();
    foreach ($rows as $row)
        $arr[$row[2]]=$row[3];
}
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');
/*
* Nodeinfo ('https://'.$host.'/nodeinfo/2.0.json') was added in v3.0.0
* Trends ('https://'.$host.'/api/v1/trends') was added in v3.0.0
* Activity ('https://'.$host.'/api/v1/instance/activity') was added in v2.1.2
*/
// With «-j» the fetched data are also written to a JSON file: a recovered session appends to
// the existing file, a new session starts the file over and writes the opening brace.
if ($opts['jsonwrite']) {
    // first element: fopen() mode; second element: its name for the error message
    $mode=$recover ? array('a','append') : array('w','write');
    $jsonf=@fopen($opts['jsonfp'],$mode[0])
        or mexit('could not open file «'.$opts['jsonfp'].'» in '.$mode[1].' mode.',1);
    if ($mode[0]!='a')
        fwrite($jsonf,'{'.N);
}
//$insts=['damze.umbrella.cafe'];
// start time and counters of the main loop: $i is the index of the next instance, $qok and
// $qgood count the «ok» and «good» instances shown in the progress line; a recovered session
// resumes from the values saved in «currinst.job»
$tini=time();
$cinsts=count($insts);
if ($recover) {
    $i=$currinst['i'];
    $qok=$currinst['qok'];
    $qgood=$currinst['qgood'];
} else {
    $i=0;
    $qok=0;
    $qgood=0;
}
// index the loop starts from in this run
$beg=$i;
while ($i<$cinsts) {
$now=time();
$host=$insts[$i];
@file_put_contents($currinstjfp,$host."\t".$i."\t".$qok."\t".$qgood.N)
or mexit('could not open «'.$currinstjfp.'» for writing.',1);
$i++;
$ismast=null;
$instans=true;
$info=null;
$tela=$now-$tini;
eecho(1,'working on «'.$host.'»; '.$i.'/'.$cinsts.'; '.$qok.' ok; '.$qgood.' good; '.round(100/$cinsts*$i).'%; elapsed time: '.ght($tela,null,0).'; estimated remaining time: '.ght($tela/$i*($cinsts-$beg)-$tela,null,0).'; mem.: '.ghs(memory_get_usage(true)).'; mem. peak: '.ghs(memory_get_peak_usage(true)).N);
if (willtrunc($host,'Instances','URI')) {
eecho(2,'«'.$host.'»: ignoring it because hostname is too long for the «URI» column of «Instances» table.'.N);
} else {
eecho(0,'«'.$host.'»: trying to fetch instance info from API...'.N);
$buf=@getfc('https://'.$host.'/api/v1/instance',$opts['timeout']);
if ($buf['cont']!==false) {
ckratelimit($buf['headers']);
$info=@json_decode($buf['cont'],true);
if (is_array($info)) {
eecho(1,'«'.$host.'»: got instance info from API :-)'.N);
eecho(0,'«'.$host.'»: trying to fetch nodeinfo specs on https...'.N);
$buf=@getfc('https://'.$host.'/.well-known/nodeinfo',$opts['timeout']);
if ($buf['cont']===false) {
eecho(0,'«'.$host.'»: trying to fetch nodeinfo specs on http...'.N);
$buf=@getfc('http://'.$host.'/.well-known/nodeinfo',$opts['timeout']);
}
if ($buf['cont']!==false) {
$buf=@json_decode($buf['cont'],true);
if (is_array($buf) && array_key_exists('links',$buf) && is_array($buf['links']) && count($buf['links'])>0) {
$nirefs=[];
foreach ($buf['links'] as $key=>$niref)
if (isset($niref['rel']) && isset($niref['href']))
$nirefs[$niref['rel']]=$niref['href'];
else
eecho(2,'«'.$host.'»: nodeinfo specs link '.$key.' has unexpected format.'.N);
krsort($nirefs);
$niref=array_shift($nirefs);
eecho(0,'«'.$host.'»: got nodeinfo specs; trying to fetch nodeinfo...'.N);
$buf=@getfc($niref,$opts['timeout']);
if ($buf['cont']!==false) {
$buf=@json_decode($buf['cont'],true);
if (is_array($buf) && isset($buf['software']['name']) && isset($buf['software']['version'])) {
$info['x-nodeinfo']=$buf;
if (preg_match('/^mastodon|fedibird|ecko|hometown/',$info['x-nodeinfo']['software']['name'])===1)
$ismast=true;
$res=myq($link,'SELECT Name FROM Platforms WHERE Name=\''.myesc($link,$info['x-nodeinfo']['software']['name']).'\'',__LINE__);
if (mysqli_num_rows($res)<1) {
if (!$opts['dryrun']) myq($link,'INSERT INTO Platforms (Name) VALUES (\''.myesc($link,truncs($info['x-nodeinfo']['software']['name'],'Platforms','Name','«'.$host.'»')).'\')',__LINE__)
or mexit(__LINE__.': '.mysqli_error($link).N,3);
notify('New software found: «'.$host.'» runs on «'.$info['x-nodeinfo']['software']['name'].'»; i added it to the table of known softwares. It would be good to check whether it is a Mastodon derivate and how compatible it is, to decide whether to consider instances using it as Mastodon instances.',2);
}
} else {
eecho(2,'«'.$host.'»: nodeinfo was not good json or json had unexpected format.'.N);
}
}
} else {
eecho(2,'«'.$host.'»: nodeinfo specs where not good json or json had unexpected format.'.N);
}
}
if (array_key_exists('version',$info)) {
eecho(1,'«'.$host.'» software version is «'.$info['version'].'».'.N);
if ($info['version']>='2.1.2') {
eecho(0,'«'.$host.'»: trying to fetch instance activity info from API...'.N);
$buf=@getfc('https://'.$host.'/api/v1/instance/activity',$opts['timeout']);
if ($buf['cont']!==false) {
ckratelimit($buf['headers']);
eecho(1,'«'.$host.'»: got instance activity info from API :-)'.N);
$info['x-activity']=json_decode($buf['cont'],true);
} else {
eecho(2,'«'.$host.'»: could not fetch instance activity from API: '.$buf['emsg'].N);
}
}
if ($info['version']>='3.0.0') {
eecho(0,'«'.$host.'»: trying to fetch instance trends info from API...'.N);
$buf=@getfc('https://'.$host.'/api/v1/trends',$opts['timeout']);
if ($buf['cont']!==false) {
ckratelimit($buf['headers']);
eecho(1,'«'.$host.'»: got instance trends info from API :-)'.N);
$info['x-trends']=json_decode($buf['cont'],true);
} else {
eecho(2,'«'.$host.'»: could not fetch instance trends from API: '.$buf['emsg'].N);
}
}
}
} else {
$instans=false;
eecho(2,'«'.$host.'»: fetched data were not good JSON.'.N);
}
} else {
$instans=false;
eecho(2,'«'.$host.'»: could not fetch instance info from API: '.$buf['emsg'].N);
}
if (!isset($info['uri']) || preg_match('#^\s*$#',$info['uri'])===1)
$instans=false;
if (is_array($info) && count($info)>0) {
//echo('json dump of all fetched info:'.N.json_encode($info,JSON_PRETTY_PRINT).N);
if ($opts['jsonwrite'])
fwrite($jsonf,'"'.$host.'": '.json_encode($info,JSON_PRETTY_PRINT).','.N);
}
if (!$instans) {
// this is the limbo of non-responding instances
// $instid stays null when the instance cannot be tied to exactly one row: in that case no check is recorded
$instid=null;
$res=myq($link,'SELECT * FROM Instances WHERE URI=\''.myesc($link,$host).'\'',__LINE__);
$nrows=mysqli_num_rows($res);
if ($nrows==1) {
    eecho(1,'«'.$host.'»: didnt respond, but it is present in the database; updating InstChecks, Instances.LastCheckOk and possibly Instances.New=0 and Instances.Dead=1.'.N);
    $row=mysqli_fetch_assoc($res);
    $instid=$row['ID'];
    if (!$opts['dryrun']) myq($link,'UPDATE Instances SET LastCheckOk=0 WHERE ID='.$instid,__LINE__);
    if ($row['New']==1 && !is_null($row['FirstSeen']) && $now-$row['FirstSeen']>$opts['oldline']) {
        notify('Instance «<a href="viewinst.php?id='.$instid.'">'.$row['URI'].'</a>» is no longer new.',2);
        if (!$opts['dryrun']) myq($link,'UPDATE Instances SET New=0 WHERE ID='.$instid,__LINE__);
    }
    // we check the last time instance responded, if ever
    $rres=myq($link,'SELECT Time FROM InstChecks WHERE InstID='.$instid.' AND Status=1 ORDER BY Time DESC LIMIT 1',__LINE__);
    // if instance never responded we consider the time of first check
    if (mysqli_num_rows($rres)==0)
        $rres=myq($link,'SELECT Time FROM InstChecks WHERE InstID='.$instid.' AND Status=0 ORDER BY Time ASC LIMIT 1',__LINE__);
    if (mysqli_num_rows($rres)>0) {
        $rrow=mysqli_fetch_assoc($rres);
        if ($now-$rrow['Time']>$opts['deadline']) {
            if (!$opts['dryrun']) myq($link,'UPDATE Instances SET Dead=1 WHERE ID='.$instid,__LINE__);
            notify('Instance «<a href="viewinst.php?id='.$instid.'">'.$row['URI'].'</a>» is dead!',2);
        }
    } else {
        eecho(2,'«'.$host.'»: exists in the database but theres no data about it in InstChecks!'.N);
    }
} elseif ($nrows==0) {
    eecho(1,'«'.$host.'»: doesnt respond and is not in the database, adding it.'.N);
    // "New=0" and "FirstSeen=NULL" because it's not new and not seen until it responds for the first time
    if (!$opts['dryrun']) {
        myq($link,'INSERT INTO Instances SET FirstSeen=NULL, New=0, Good=0, Chosen=0, Visible=0, Noxious=0, URI=\''.myesc($link,$host).'\', LastCheckOk=0, InsertTS='.$now,__LINE__);
        $instid=mysqli_insert_id($link);
        // NOTE(review): together with the insert that closes this branch, a newly added instance gets two InstChecks rows for the same check — confirm whether this is intended
        myq($link,'INSERT INTO InstChecks SET InstID='.$instid.', Time='.$now.', Status=0',__LINE__);
    } else {
        $instid=0;
    }
} else {
    notify('Instance «'.$host.'» has '.$nrows.' entries in «Instances» table!',3);
}
if (!$opts['dryrun'] && !is_null($instid)) myq($link,'INSERT INTO InstChecks (InstID, Time, Status) VALUES ('.$instid.', '.$now.', 0)',__LINE__);
} else {
// instance responded
if (is_null($ismast)) {
if (!array_key_exists('version',$info)) {
$ismast=null;// redundant, just to put there something
} elseif (array_key_exists('pleroma',$info)) {
$ismast=false;
} elseif (preg_match('#(compatible|pleroma|pixelfed)#i',$info['version'])==1) {
$ismast=false;
} elseif (preg_match('#^[0-9]+\.[0-9]+\.[0-9]+#',$info['version'])!==1) {
$ismast=false;
} else {
$ismast=true;
}
}
$qok++;
if (!is_null($ismast))
($ismast) ? $ismast=1 : $ismast=0;
$instrow=array('ID'=>null, 'FirstSeen'=>null, 'IsMastodon'=>$ismast, 'Dead'=>0, 'New'=>0, 'Good'=>0, 'Chosen'=>0, 'Priority'=>null, 'Visible'=>0, 'Noxious'=>0, 'NoxReason'=>null, 'NoxLastModTS'=>null, 'URI'=>null, 'Title'=>null, 'ShortDesc'=>null, 'LongDesc'=>null, 'OurDesc'=>null, 'OurDescEN'=> null, 'LocalityID'=>null, 'OurLangsLock'=>0, 'Email'=>null, 'Software'=>null, 'Version'=>null, 'UserCount'=>null, 'StatusCount'=>null, 'DomainCount'=>null, 'ActiveUsersMonth'=>null, 'ActiveUsersHalfYear'=>null, 'Thumb'=>null, 'RegOpen'=>null, 'RegReqApproval'=>null, 'MaxTootChars'=>null, 'AdmAccount'=>null, 'AdmDisplayName'=>null, 'AdmCreatedAt'=>null, 'AdmNote'=>null, 'AdmURL'=>null, 'AdmAvatar'=>null, 'AdmHeader'=>null, 'LastCheckOk'=>1, 'GuestID'=>null, 'LastGuestEdit'=>null);
$instrow['URI']=$host;
if (akeavinn('title',$info))
$instrow['Title']=nempty(truncs($info['title'],'Instances','Title','«'.$instrow['URI'].'»'));
if (akeavinn('short_description',$info))
$instrow['ShortDesc']=nempty(truncs($info['short_description'],'Instances','ShortDesc','«'.$instrow['URI'].'»'));
if (akeavinn('description',$info))
$instrow['LongDesc']=nempty(truncs($info['description'],'Instances','LongDesc','«'.$instrow['URI'].'»'));
if (akeavinn('email',$info))
$instrow['Email']=nempty(truncs($info['email'],'Instances','Email','«'.$instrow['URI'].'»'));
if (akeavinn('version',$info))
$instrow['Version']=nempty(truncs($info['version'],'Instances','Version','«'.$instrow['URI'].'»'));
if (akeavinn('stats',$info)) {
if (akeavinn('user_count',$info['stats']))
$instrow['UserCount']=truncn($info['stats']['user_count'],'Instances','UserCount','«'.$instrow['URI'].'»');
if (akeavinn('status_count',$info['stats']))
$instrow['StatusCount']=truncn($info['stats']['status_count'],'Instances','StatusCount','«'.$instrow['URI'].'»');
if (akeavinn('domain_count',$info['stats']))
$instrow['DomainCount']=truncn($info['stats']['domain_count'],'Instances','DomainCount','«'.$instrow['URI'].'»');
}
if (akeavinn('thumbnail',$info))
$instrow['Thumb']=nempty(truncs($info['thumbnail'],'Instances','Thumb','«'.$instrow['URI'].'»'));
if (akeavinn('max_toot_chars',$info))
$instrow['MaxTootChars']=truncn($info['max_toot_chars'],'Instances','MaxTootChars','«'.$instrow['URI'].'»');
if (akeavinn('registrations',$info))
$instrow['RegOpen']=b2i($info['registrations'],'Istanza «'.$instrow['URI'].'»: ');
if (akeavinn('approval_required',$info))
$instrow['RegReqApproval']=b2i($info['approval_required'],'Istanza «'.$instrow['URI'].'»: ');
if (akeavinn('contact_account',$info)) {
if (akeavinn('acct',$info['contact_account']))
$instrow['AdmAccount']=nempty(truncs($info['contact_account']['acct'],'Instances','AdmAccount','«'.$instrow['URI'].'»'));
if (akeavinn('display_name',$info['contact_account']))
$instrow['AdmDisplayName']=nempty(truncs($info['contact_account']['display_name'],'Instances','AdmDisplayName','«'.$instrow['URI'].'»'));
if (akeavinn('created_at',$info['contact_account']))
$instrow['AdmCreatedAt']=pgdatetomy($info['contact_account']['created_at']);
if (akeavinn('note',$info['contact_account']))
$instrow['AdmNote']=nempty(truncs($info['contact_account']['note'],'Instances','AdmNote','«'.$instrow['URI'].'»'));
if (akeavinn('url',$info['contact_account']))
$instrow['AdmURL']=nempty(truncs($info['contact_account']['url'],'Instances','AdmURL','«'.$instrow['URI'].'»'));
if (akeavinn('avatar',$info['contact_account']))
$instrow['AdmAvatar']=nempty(truncs($info['contact_account']['avatar'],'Instances','AdmAvatar','«'.$instrow['URI'].'»'));
if (akeavinn('header',$info['contact_account']))
$instrow['AdmHeader']=nempty(truncs($info['contact_account']['header'],'Instances','AdmHeader','«'.$instrow['URI'].'»'));
}
if (akeavinn('x-nodeinfo',$info)) {
if (akeavinn('software',$info['x-nodeinfo']) && akeavinn('name',$info['x-nodeinfo']['software']))
$instrow['Software']=nempty(truncs($info['x-nodeinfo']['software']['name'],'Instances','Software','«'.$instrow['URI'].'»'));
if (akeavinn('usage',$info['x-nodeinfo']) && akeavinn('users',$info['x-nodeinfo']['usage'])) {
if (akeavinn('activeMonth',$info['x-nodeinfo']['usage']['users']))
$instrow['ActiveUsersMonth']=truncn($info['x-nodeinfo']['usage']['users']['activeMonth'],'Instances','ActiveUsersMonth','«'.$instrow['URI'].'»');
if (akeavinn('activeHalfyear',$info['x-nodeinfo']['usage']['users']))
$instrow['ActiveUsersHalfYear']=truncn($info['x-nodeinfo']['usage']['users']['activeHalfyear'],'Instances','ActiveUsersHalfYear','«'.$instrow['URI'].'»');
}
}
$whynot=array();
if (is_null($instrow['RegOpen'])) {
$whynot[]='we dont know if it allows registrations';
} elseif ($instrow['RegOpen']==0) {
$whynot[]='it doesnt allow registrations';
}
if (is_null($instrow['UserCount'])) {
$whynot[]='we dont know its total users number';
} elseif ($instrow['UserCount']<10 || $instrow['UserCount']>30000) {
$whynot[]='total users number is not greater than 10 and less than 30000';
}
if (is_null($instrow['DomainCount'])) {
$whynot[]='we dont know the number of other instances it knows';
} elseif ($instrow['DomainCount']<500) {
$whynot[]='the number of other instances it knows is less than 500';
}
if (!is_null($instrow['ActiveUsersMonth'])) {
if ($instrow['ActiveUsersMonth']<10)
$whynot[]='the number of active users for the last month is less than 10';
} elseif (!is_null($instrow['StatusCount']) && $instrow['UserCount']>0 && $instrow['StatusCount']/$instrow['UserCount']<10) {
$whynot[]='the average number of toots for user is less than 10';
} else {
$whynot[]='it was impossible to detect the number of active users for the last month or the average number of toots for user';
}
if (count($whynot)==0) {
$instrow['Good']=1;
eecho(1,'«'.$host.'»: this is a suitable instance! :-)'.N);
$qgood++;
} else {
eecho(1,'«'.$host.'»: this is not a suitable instance: '.implode('; ',$whynot).' :-('.N);
}
$res=myq($link,'SELECT * FROM Instances WHERE URI=\''.myesc($link,$instrow['URI']).'\'',__LINE__);
$nrows=mysqli_num_rows($res);
if ($nrows==1) {
eecho(1,'«'.$instrow['URI'].'»: is already present in the database, updating it...'.N);
$oldinstrow=mysqli_fetch_assoc($res);
$instid=$oldinstrow['ID'];
$instrow['ID']=$oldinstrow['ID'];
// if the instance already present in the db has FirstSeen=NULL, this means this is the first time it responds, so...
if (is_null($oldinstrow['FirstSeen'])) {
$instrow['FirstSeen']=$now;
$instrow['New']=1;
} else {
$instrow['FirstSeen']=$oldinstrow['FirstSeen'];
if ($oldinstrow['New']==1) {
$instrow['New']=1;
if ($now-$oldinstrow['FirstSeen']>$opts['oldline']) {
$instrow['New']=0;
notify('Instance «<a href="viewinst.php?id='.$instrow['ID'].'">'.$instrow['URI'].'</a>» is no longer new.',2);
}
}
}
if ($instrow['Good']==1 && $oldinstrow['Good']==0) {
notify('Instance «<a href="viewinst.php?id='.$instrow['ID'].'">'.$instrow['URI'].'</a>» wasnt suitable, but it is now!',1);
} elseif ($instrow['Good']==0 && $oldinstrow['Good']==1) {
notify('Instance «<a href="viewinst.php?id='.$instrow['ID'].'">'.$instrow['URI'].'</a>» was suitable, but its no longer for these reasons: '.implode('; ',$whynot),1);
}
$instrow['Chosen']=$oldinstrow['Chosen'];
$instrow['Priority']=$oldinstrow['Priority'];
$instrow['Visible']=$oldinstrow['Visible'];
$instrow['Noxious']=$oldinstrow['Noxious'];
$instrow['NoxReason']=$oldinstrow['NoxReason'];
$instrow['NoxLastModTS']=$oldinstrow['NoxLastModTS'];
if ($instrow['ShortDesc']!=$oldinstrow['ShortDesc'])
notify('«Short description» of instance «<a href="viewinst.php?id='.$instrow['ID'].'">'.$instrow['URI'].'</a>» has changed.',1);
if ($instrow['LongDesc']!=$oldinstrow['LongDesc'])
notify('«Long description» of instance «<a href="viewinst.php?id='.$instrow['ID'].'">'.$instrow['URI'].'</a>» has changed.',1);
$instrow['OurDesc']=$oldinstrow['OurDesc'];
$instrow['OurDescEN']=$oldinstrow['OurDescEN'];
$instrow['LocalityID']=$oldinstrow['LocalityID'];
$instrow['OurLangsLock']=$oldinstrow['OurLangsLock'];
$instrow['GuestID']=$oldinstrow['GuestID'];
$instrow['LastGuestEdit']=$oldinstrow['LastGuestEdit'];
$query='UPDATE Instances SET ';
foreach ($instrow as $field=>$value) {
if (!is_null($value))
$query.=$field.'=\''.myesc($link,$value).'\', ';
else
$query.=$field.'=NULL, ';
}
$query=substr($query,0,-2).' WHERE Instances.ID='.$instrow['ID'];
eecho(1,'«'.$host.'»: update query: «'.$query.'».'.N);
if (!$opts['dryrun']) myq($link,$query,__LINE__);
$res=myq($link,'SELECT InstID, LangID, Pos, Code FROM InstLangs LEFT JOIN Languages ON Languages.ID=LangID WHERE InstID='.$instrow['ID'].' ORDER BY Pos ASC',__LINE__);
$oldinstlangs=array();
while ($row=mysqli_fetch_assoc($res))
$oldinstlangs[]=$row;
$instlangs=langs($instrow['ID'], $instrow['URI'], false);
if ($instlangs!=$oldinstlangs) {
notify('The list of languages declared by instance «<a href="viewinst.php?id='.$instrow['ID'].'">'.$instrow['URI'].'</a>» has changed from «'.subarimp(', ','Code',$oldinstlangs).'» to «'.subarimp(', ','Code',$instlangs).'».',1);
if (!$opts['dryrun']) {
myq($link,'DELETE FROM InstLangs WHERE InstID='.$instrow['ID'],__LINE__);
foreach ($instlangs as $row)
myq($link,'INSERT INTO InstLangs (InstID, LangID, Pos) VALUES ('.$row['InstID'].', '.$row['LangID'].', '.$row['Pos'].')',__LINE__);
}
}
if ($instrow['OurLangsLock']==0) {
$instourlangs=langs($instrow['ID'], $instrow['URI'], true);
// if instourlangs is empty and instlangs is not, set instourlangs as instlangs
if (count($instourlangs)==0 && count($instlangs)>0)
$instourlangs=$instlangs;
if (count($instourlangs)>0) {
if (!$opts['dryrun']) {
myq($link,'DELETE FROM InstOurLangs WHERE InstID='.$instrow['ID'],__LINE__);
foreach ($instourlangs as $row)
myq($link,'INSERT INTO InstOurLangs (InstID, OurLangID, Pos) VALUES ('.$row['InstID'].', '.$row['LangID'].', '.$row['Pos'].')',__LINE__);
}
}
}
} elseif ($nrows==0) {
eecho(1,'«'.$host.'» is not present in the database, adding it...'.N);
$instrow['FirstSeen']=$now;
if ($opts['setnew'])
$instrow['New']=1;
$fields=array();
$values='';
foreach ($instrow as $field=>$value) {
$fields[]=$field;
if (!is_null($value))
$values.='\''.myesc($link,$value).'\', ';
else
$values.='NULL, ';
}
$values=substr($values,0,-2);
$query='INSERT INTO Instances ('.implode(', ',$fields).', InsertTS) VALUES ('.$values.', '.$now.')';
eecho(1,'«'.$host.'»: insert query: «'.$query.'»'.N);
if (!$opts['dryrun']) {
myq($link,$query,__LINE__);
$instid=mysqli_insert_id($link);
} else {
$instid=0;
}
if ($opts['setnew'] && !$opts['dryrun'])
notify('New instance found: «<a href="viewinst.php?id='.$instid.'">'.$instrow['URI'].'</a>».',1);
$instlangs=langs($instid, $instrow['URI'], false);
if (!$opts['dryrun']) {
foreach ($instlangs as $row)
myq($link,'INSERT INTO InstLangs (InstID, LangID, Pos) VALUES ('.$row['InstID'].', '.$row['LangID'].', '.$row['Pos'].')',__LINE__);
}
$instourlangs=langs($instid, $instrow['URI'], true);
// if instourlangs is empty and instlangs is not, set instourlangs as instlangs
if (count($instourlangs)==0 && count($instlangs)>0)
$instourlangs=$instlangs;
if (!$opts['dryrun']) {
foreach ($instourlangs as $row)
myq($link,'INSERT INTO InstOurLangs (InstID, OurLangID, Pos) VALUES ('.$row['InstID'].', '.$row['LangID'].', '.$row['Pos'].')',__LINE__);
}
if ($instrow['Good']==1)
notify('New instance «<a href="viewinst.php?id='.$instid.'">'.$instrow['URI'].'</a>» is suitable!',1);
} else {
notify('Instance «'.$host.'» has '.$nrows.' entries in «Instances» table!',3);
}
if (array_key_exists('x-activity',$info) && is_array($info['x-activity'])) {
if (!$opts['dryrun']) {
myq($link,'DELETE FROM InstActivity WHERE InstID='.$instid,__LINE__);
$pos=0;
foreach ($info['x-activity'] as $buf) {
if (akeavinn('week',$buf) && akeavinn('statuses',$buf) && akeavinn('logins',$buf) && akeavinn('registrations',$buf)) {
$pos++;
$query='INSERT INTO InstActivity (InstID, Week, Statuses, Logins, Registrations, Pos) VALUES (\''.$instid.'\', \''.myesc($link,$buf['week']).'\', \''.myesc($link,$buf['statuses']).'\', \''.myesc($link,$buf['logins']).'\', \''.myesc($link,$buf['registrations']).'\', '.$pos.')';
myq($link,$query,__LINE__);
}
}
}
}
if (array_key_exists('x-trends',$info) && is_array($info['x-trends'])) {
$trends=array();
foreach ($info['x-trends'] as $buf) {
if (akeavinn('name',$buf) && akeavinn('url',$buf) && akeavinn('history',$buf) && is_array($buf['history'])) {
$trend=0;
foreach ($buf['history'] as $row) {
if ($row['uses']>0)
$trend+=($row['accounts']/$row['uses']);
}
$trends[]=array(
'InstID'=>$instid,
'LastDay'=>$buf['history'][0]['day'],
'Name'=>$buf['name'],
'URL'=>$buf['url'],
'Pos'=>null,
'trend'=>$trend
);
}
}
mdasortbykey($trends,'trend',true);
//print_r($trends);
if (!$opts['dryrun']) myq($link,'DELETE FROM InstTrends WHERE InstID='.$instid,__LINE__);
$pos=0;
foreach ($trends as $trend) {
$pos++;
$query='INSERT INTO InstTrends (InstID, LastDay, Name, URL, Pos) VALUES ('.$trend['InstID'].', \''.$trend['LastDay'].'\', \''.myesc($link,truncs($trend['Name'],'InstTrends','Name','«'.$instrow['URI'].'»')).'\', \''.myesc($link,truncs($trend['URL'],'InstTrends','URL','«'.$instrow['URI'].'»')).'\', '.$pos.')';
if (!$opts['dryrun']) myq($link,$query,__LINE__);
}
}
if (!$opts['dryrun']) myq($link,'INSERT INTO InstChecks (InstID, Time, Status) VALUES ('.$instid.', '.$now.', 1)',__LINE__);
if ($opts['fetchusers'] && $ismast && array_key_exists('version',$info) && $info['version']>='4.0.0') {
eecho(0,'«'.$host.'»: trying to fetch users info from directory API...'.N);
$exusers=[];// array of this instance's users already existing in the db
$res=myq($link,'SELECT ID, locid, username FROM Users WHERE InstID='.$instid,__LINE__);
while ($row=mysqli_fetch_assoc($res)) $exusers[$row['locid']]=$row;
$users=[];// array of users in this instance's directory
$chunk=0;
$limit=80;
$end=false;
while (!$end) {
$offset=$chunk*$limit;
$buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']);
if ($buf['cont']!==false) {
ckratelimit($buf['headers']);
eecho(1,'«'.$host.'»: got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N);
$buf=@json_decode($buf['cont'],true);
if (is_array($buf)) {
//print_r($buf);
if (count($buf)<$limit) $end=true;
/*if (count($buf)>0 && !array_key_exists('noindex',$buf[0])) {
eecho(2,'«'.$host.'»: account entities reported by directory api endpoint dont have a “noindex” attribute; skipping directory fetching.'.N);
break;
} else {
eecho(0,'«'.$host.'»: account entities reported by directory api endpoint do have a “noindex” attribute; continuing with directory fetching.'.N);
}*/
//foreach ($buf as $user) echo($user['username'].' '); echo(N.N);
foreach ($buf as $user) {
if (make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields', 'noindex'], $user)) {
eecho(0,'«'.$host.'» ('.$i.'/'.$cinsts.'): working on user «'.$user['username'].'»...'.N);
// disabled because it takes too long on instances with many users
/*if (!isset($user['noindex'])) {
$user['noindex']=true;
eecho(0,'«'.$host.'»: «'.$user['username'].'»: «noindex» is undefined, trying to define it by fetching users profile page...'.N);
$page=getfc($user['url'],$opts['timeout']);
// here ckratelimit is not needed because it's a normal web page, not json from mastodon api
if ($page['cont']!==false) {
//<meta content='noindex, noarchive' name='robots'>
if (preg_match('/<meta\s+content=[\'"](noindex|noarchive)/ui',$page['cont'])!==1) {
$user['noindex']=false;
eecho(0,'«'.$user['url'].'»: «noindex» is not set.'.N);
} else {
eecho(0,'«'.$user['url'].'»: «noindex» is set.'.N);
}
} else {
eecho(2,'«'.$host.'»: could not fetch «'.$user['url'].'»: '.$page['emsg'].N);
}
}*/
$snote=strip_tags($user['note']);
if (preg_match('/(?<!\w)#(nobots?|noindex)(?!\w)/iu',$snote)===1) $user['noindex']=true;
if (preg_match('/(?<!\w)#(okindex|yesindex|doindex|okmhindex)(?!\w)/iu',$snote)===1) $user['noindex']=false;
// disabled; takes too long on instances with many users
/*$user['tags']=[];
if (!$user['noindex'] && $info['version']>='3.3.0') {
eecho(0,'«'.$host.'»: trying to fetch tags for user «'.$user['username'].'»...'.N);
$tags=@getfc('https://'.$host.'/api/v1/accounts/'.$user['id'].'/featured_tags',$opts['timeout']);
if ($tags['cont']!==false) {
ckratelimit($tags['headers']);
$tags=@json_decode($tags['cont'],true);
if (is_array($tags) && count($tags)>0) {
eecho(1,'«'.$host.'»: got '.count($tags).' tag(s) for user «'.$user['username'].'» :-)'.N);
foreach($tags as $tag) $user['tags'][]=$tag['name'];
}
} else {
eecho(2,'«'.$host.'»: could not fetch tags for user «'.$user['username'].'» :-( ('.$tags['emsg'].').'.N);
}
}
$user['tags']=implode(';',$user['tags']);
if ($user['tags']=='') $user['tags']=null;*/
$user['tags']=null;
if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']);
if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']);
$users[$user['id']]=$user;
} else {
eecho(2,'«'.$host.'»: user record missed some required keys :-('.N);
//print_r($user);
}
}
} else {
eecho(2,'«'.$host.'»: ... but the chunk was not good JSON :-('.N);
$end=true;
}
$chunk++;
} else {
eecho(2,'«'.$host.'»: could not fetch users info from directory API: '.$buf['emsg'].N);
$end=true;
}
}
foreach ($users as $locid=>$user) {
$query='SET InstID='.$instid.', host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,truncs($user['username'], 'Users', 'username', '«'.$host.'»: «'.$user['username'].'»')).', display_name='.myv($link,truncs($user['display_name'], 'Users', 'display_name', '«'.$host.'»: «'.$user['username'].'»')).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,truncs($user['note'], 'Users', 'note', '«'.$host.'»: «'.$user['username'].'»')).', url='.myv($link,truncs($user['url'], 'Users', 'url', '«'.$host.'»: «'.$user['username'].'»')).', avatar='.myv($link,truncs($user['avatar'], 'Users', 'avatar', '«'.$host.'»: «'.$user['username'].'»')).', header='.myv($link,truncs($user['header'], 'Users', 'header', '«'.$host.'»: «'.$user['username'].'»')).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', tags='.myv($link,truncs($user['tags'], 'Users', 'tags', '«'.$host.'»: «'.$user['username'].'»'));
$uid=0;
if (!array_key_exists($user['id'],$exusers)) {
if (!$user['noindex']) {
eecho(0,'«'.$host.'»: inserting new user «'.$user['username'].'»...'.N);
$query='INSERT INTO Users '.$query;
if (!$opts['dryrun']) {
myq($link,$query,__LINE__);
$uid=mysqli_insert_id($link);
}
} else {
eecho(0,'«'.$host.'»: NOT inserting user «'.$user['username'].'» because they dont want to be indexed...'.N);
}
} else {
$uid=$exusers[$locid]['ID'];
if (!$user['noindex']) {
eecho(0,'«'.$host.'»: updating existing user «'.$user['username'].'» ('.$uid.')...'.N);
$query='UPDATE Users '.$query.' WHERE ID='.$uid;
} else {
eecho(0,'«'.$host.'»: deleting existing user «'.$user['username'].'» ('.$uid.') because they dont want to be indexed...'.N);
$query='DELETE FROM Users WHERE ID='.$uid;
}
if (!$opts['dryrun']) {
myq($link,$query,__LINE__);
myq($link,'DELETE FROM UsersFields WHERE UserID='.$uid,__LINE__);
}
}
if ($uid!=0 && !$user['noindex'] && is_array($user['fields']) && count($user['fields'])>0) {
eecho(0,'«'.$host.'»: saving user fields for user «'.$user['username'].'» ('.$uid.')...'.N);
foreach ($user['fields'] as $field) {
(is_null($field['verified_at'])) ? $field['verified_at']=0 : $field['verified_at']=1;
$field['name']=truncs($field['name'],'UsersFields','name','«'.$host.'»: «'.$user['username'].'»');
$field['value']=truncs($field['value'],'UsersFields','value','«'.$host.'»: «'.$user['username'].'»');
if (!$opts['dryrun']) myq($link,'INSERT INTO UsersFields SET UserID='.$uid.', name='.myv($link,$field['name']).', value='.myv($link,$field['value']).', verified='.$field['verified_at'],__LINE__);
}
}
}
foreach ($exusers as $locid=>$exuser) {
if (!array_key_exists($locid,$users)) {
eecho(0,'«'.$host.'»: user «'.$exusers[$locid]['username'].'» opted out of the directory, deleting their record ('.$exuser['ID'].')...'.N);
if (!$opts['dryrun']) {
myq($link,'DELETE FROM Users WHERE ID='.$exuser['ID'],__LINE__);
myq($link,'DELETE FROM UsersFields WHERE UserID='.$exuser['ID'],__LINE__);
}
}
}
}
}
}
}
// Normal end of the run: release every resource and finish with exit code 0.
// $link is unset right after closing because mexit() and the signal handler
// call mysqli_close() on it whenever it is still set.
mysqli_close($link);
unset($link);
if ($opts['jsonwrite']) {
// write a last «"The end?": true» member followed by a closing brace
// (presumably this terminates the JSON object opened earlier on $jsonf -- confirm where it is opened)
fwrite($jsonf,'"The end?": true'.N.'}'.N);
fclose($jsonf);
}
// remove the files at $instsjfp, $currinstjfp and $lockfp
// NOTE(review): these look like the crawler's state files and its lock file -- confirm where they are created
unlink($instsjfp);
unlink($currinstjfp);
unlink($lockfp);
// report the time elapsed since $tini, formatted by ght(), then stop
eecho(1,'Done (in '.ght(time()-$tini,null,0).') :-)'.N);
exit(0);
// "multi array_key_exists"
/**
 * Tells whether every key listed in $keys exists in $arr.
 *
 * @param array $keys keys that must all be present
 * @param array $arr  array to inspect; taken by reference but never modified here
 * @return bool true when no key is missing (also when $keys is empty), false otherwise
 */
function make($keys,&$arr) {
	foreach ($keys as $wanted) {
		if (array_key_exists($wanted,$arr))
			continue;
		// first missing key settles the answer
		return false;
	}
	return true;
}
/**
 * Renders a PHP value as a literal ready to be concatenated into an SQL statement.
 *
 * - null                         => NULL
 * - booleans                     => 1 / 0 (unquoted)
 * - empty or whitespace-only     => NULL
 * - anything else                => the value escaped with mysqli_real_escape_string()
 *                                   and wrapped in single quotes
 *
 * @param mysqli $link open connection, used only for escaping
 * @param mixed  $var  value to render
 * @return string SQL fragment
 */
function myv(&$link,$var) {
	if (is_null($var))
		return 'NULL';
	if (is_bool($var))
		return $var ? '1' : '0';
	// trim() always yields a string, so a strict comparison is equivalent here
	if (trim($var)==='')
		return 'NULL';
	return "'".mysqli_real_escape_string($link,$var)."'";
}
/**
 * Converts a "YYYY-MM-DD" date into the Unix timestamp of midnight of that day,
 * computed by mktime() in the script's default timezone.
 *
 * Only the leading year-month-day part is used, so a full ISO 8601 datetime
 * ("2022-03-31T04:05:00.000Z") is accepted too and its time part is ignored.
 * Out-of-range months/days are normalised by mktime() as before.
 *
 * @param mixed $date date string as returned by the remote API
 * @return int|false|null timestamp (false if mktime() fails), or null when $date
 *                        is not a string starting with a year-month-day triple;
 *                        previously such input triggered warnings or an uncaught
 *                        TypeError that aborted the whole run
 */
function datetomy($date) {
	if (!is_string($date) || preg_match('/^\s*(\d+)-(\d+)-(\d+)/',$date,$parts)!==1)
		return(null);
	// explicit int casts: never hand non-numeric fragments to mktime()
	return(mktime(0,0,0,(int)$parts[2],(int)$parts[3],(int)$parts[1]));
}
/**
 * Inspects the raw HTTP response headers of an API call and, when fewer than 3
 * requests are left in the current rate-limit window, sleeps until the window
 * resets (reset time minus the server's own «date», plus one second).
 *
 * A warning is logged through eecho() and nothing else happens when the
 * «x-ratelimit-reset», «date» or «x-ratelimit-remaining» header is missing, or
 * when one of the two timestamps cannot be parsed. A reset time that already
 * lies in the past results in a zero-second sleep instead of a negative
 * argument to sleep() (a ValueError on PHP 8).
 *
 * @param string $httpresphead raw header block, lines separated by CRLF, status line first
 * @return void
 */
function ckratelimit($httpresphead) {
	// build a lower-cased name => value map; the first line (status line) is discarded
	$lines=explode("\r\n",$httpresphead);
	array_shift($lines);
	$headers=[];
	foreach ($lines as $line)
		if (preg_match('/^([^:]+):(.*)$/Uu',$line,$matches)===1)
			$headers[strtolower($matches[1])]=trim($matches[2]);
	if (!array_key_exists('x-ratelimit-reset',$headers)) {
		eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!'.N);
		return;
	}
	if (!array_key_exists('date',$headers)) {
		eecho(2,'ckratelimit: $httpresphead did not contain a «date» header!'.N);
		return;
	}
	if (!array_key_exists('x-ratelimit-remaining',$headers)) {
		eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-remaining» header!'.N);
		return;
	}
	if ($headers['x-ratelimit-remaining']<3) {
		// e.g. «Wed, 30 Mar 2022 21:27:22 GMT»
		$srvnow=strtotime($headers['date']);
		// e.g. «2022-03-31T04:05:00.058705Z»
		$srvrlr=strtotime($headers['x-ratelimit-reset']);
		if ($srvnow===false || $srvrlr===false) {
			// without both timestamps the sleep length would be meaningless (possibly huge)
			eecho(2,'ckratelimit: could not parse the «date» or «x-ratelimit-reset» header!'.N);
			return;
		}
		// never negative: the window may have been reset already
		$stosl=max(0,$srvrlr-$srvnow+1);
		eecho(2,'reached rate limit, sleeping for '.$stosl.' seconds ...'.N);
		sleep($stosl);
	}
}
?>