#!/usr/bin/php
<?php
/*
 * Fediverse peers crawler.
 *
 * Starting from one instance, this script requests «/api/v1/instance/peers»
 * from every host it learns about, writes the lists of seen / checked /
 * responding hostnames to files and records every check in the MySQL
 * database configured in the ini file.
 *
 * NOTE(review): the opening PHP tag and the original header comment were not
 * recoverable from the reviewed copy of this file; restore the project's
 * license header here.
 */

define('N', "\n");
define('SNAME', basename(__FILE__));
define('BNAME', preg_replace('/\.[^.]*$/', '', SNAME));

require(__DIR__.'/../site/mustard/include/gurl.php');
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');

use function mysqli_real_escape_string as myesc;

$opts = [
    'inifp' => __DIR__.'/../conf/mustard.ini',
    'startinst' => 'mastodon.social',
    'deadline' => 62*24*60*60,
    'peersfp' => __DIR__.'/peers',
    'apeersfp' => __DIR__.'/peers.all',
    'cpeersfp' => __DIR__.'/peers.checked',
    'excludefp' => null,
    'timeout' => 8,
    'curltimeout' => 15,
    'loop' => false,
    'verbose' => false,
    'excludedead' => false,
    'ignorelock' => false
];

$help = 'SYNOPSIS

  '.SNAME.' [options]

DESCRIPTION

  This program tries to build a fairly complete list of fediverse instances
  exposing the [instance]/api/v1/instance/peers endpoint.

OPTIONS

  -s, --startinst <domain>
    Defines the first instance to crawl.
    DEFAULT: «'.$opts['startinst'].'»

  -p, --peersfp <file>
    Defines the file into which the ordered list of responding instances
    will be saved.
    DEFAULT: «'.$opts['peersfp'].'»

  -a, --apeersfp <file>
    Defines the file into which the ordered list of all instances will be
    saved.
    DEFAULT: «'.$opts['apeersfp'].'»

  -c, --cpeersfp <file>
    Defines the file into which the ordered list of all checked instances
    will be saved.
    DEFAULT: «'.$opts['cpeersfp'].'»

  -I, --ignorelock
    Normally, if its lockfile exists, the program exits with an error before
    doing anything. With this option the lockfile is ignored. Please verify
    that the program is not already running before using it.

  -e, --excludefp <file>
    Defines a file containing exclusion rules: one regular expression per
    line (empty lines are ignored). Any instance matching any defined regex
    will be ignored by the program. Changes made to this file during program
    execution will be taken into account.

  -E, --excludedead
    Exclude instances marked as “Dead” in the database.

  -l, --loop
    Normally the script will exit after completing a crawl; if this option
    is set, it will restart crawling until it receives a SIGTERM, SIGHUP or
    SIGINT.

  -t, --timeout <seconds>
    Defines the timeout in seconds for every connection attempt.
    DEFAULT: «'.$opts['timeout'].'»

  -T, --curltimeout <seconds>
    Defines the timeout in seconds for every download.
    DEFAULT: «'.$opts['curltimeout'].'»

  -v, --verbose
    Be more verbose.

  -h, --help
    Show this help text and exit.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see <http://www.gnu.org/licenses/> for details.'.N;

// ---- command line parsing ----

// Short option => long option.
$shortopts = [
    '-s' => '--startinst',
    '-p' => '--peersfp',
    '-a' => '--apeersfp',
    '-c' => '--cpeersfp',
    '-I' => '--ignorelock',
    '-e' => '--excludefp',
    '-E' => '--excludedead',
    '-l' => '--loop',
    '-t' => '--timeout',
    '-T' => '--curltimeout',
    '-v' => '--verbose',
    '-h' => '--help'
];
// Options that take a value: long option => [key in $opts, what must follow, numeric?].
$valopts = [
    '--startinst' => ['startinst', 'a domain name', false],
    '--peersfp' => ['peersfp', 'a file’s path', false],
    '--apeersfp' => ['apeersfp', 'a file’s path', false],
    '--cpeersfp' => ['cpeersfp', 'a file’s path', false],
    '--excludefp' => ['excludefp', 'a file’s path', false],
    '--timeout' => ['timeout', 'a number of seconds', true],
    '--curltimeout' => ['curltimeout', 'a number of seconds', true]
];
// Boolean switches: long option => key in $opts.
$flagopts = [
    '--ignorelock' => 'ignorelock',
    '--excludedead' => 'excludedead',
    '--loop' => 'loop',
    '--verbose' => 'verbose'
];

for ($i = 1; $i < $argc; $i++) {
    $arg = $argv[$i];
    $name = $shortopts[$arg] ?? $arg;
    if ($name === '--help') {
        mexit($help, 0);
    } elseif (isset($flagopts[$name])) {
        $opts[$flagopts[$name]] = true;
    } elseif (isset($valopts[$name])) {
        list($key, $what, $numeric) = $valopts[$name];
        // \z (not $) so that a value with a trailing newline is rejected.
        if ($i + 1 >= $argc || ($numeric && preg_match('/^[0-9]+\z/', $argv[$i + 1]) !== 1)) {
            mexit('Error: option «'.$arg.'» has to be followed by '.$what.' (use «-h» for more info).'.N, 1);
        }
        $i++;
        $opts[$key] = $numeric ? (int)$argv[$i] : $argv[$i];
    } else {
        mexit('Error: don’t know how to interpret «'.$arg.'» (use «-h» to read the help text).'.N, 1);
    }
}

// ---- lockfile ----

$lockfp = __DIR__.'/'.BNAME.'.lock';
if (is_file($lockfp) && !$opts['ignorelock']) {
    // mexit() is not used here because it would delete the existing lockfile.
    gecho('Error: lockfile exists: it seems the program is already running; if you’re sure it’s not, you can use «-I» to force execution.'.N, false, true);
    exit(1);
}
if (@touch($lockfp) === false) {
    mexit('Error: could not create lockfile «'.$lockfp.'».'.N, 1);
}

// ---- signals ----

pcntl_async_signals(true);
pcntl_signal(SIGTERM, 'sighandler'); // Termination ('kill' was called)
pcntl_signal(SIGHUP, 'sighandler');  // Terminal log-out
pcntl_signal(SIGINT, 'sighandler');  // Interrupted (Ctrl-C is pressed)

// ---- configuration and database ----

$iniarr = @parse_ini_file($opts['inifp']);
if (!$iniarr) {
    mexit('Error: couldn’t open «'.$opts['inifp'].'».'.N, 1);
}

try {
    $link = @mysqli_connect($iniarr['db_host'], $iniarr['db_admin_name'], $iniarr['db_admin_password'], $iniarr['db_name'], $iniarr['db_port'], $iniarr['db_socket']);
} catch (Exception $error) {
    mexit('Error: couldn’t connect to MySQL server: '.mysqli_connect_error().'.'.N, 1);
}
// for php versions < 8
if ($link === false) {
    mexit('Error: couldn’t connect to MySQL server: '.mysqli_connect_error().'.'.N, 1);
}

try {
    $res = mysqli_set_charset($link, 'utf8mb4');
} catch (Exception $error) {
    mexit('Error: couldn’t set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N, 1);
}
// for php versions < 8
if ($res === false) {
    mexit('Error: couldn’t set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N, 1);
}

// Hostnames flagged as dead in either table; only loaded when requested.
$deadinsts = [];
if ($opts['excludedead']) {
    $res = myq($link, 'SELECT URI FROM Instances WHERE Dead=1');
    while ($row = mysqli_fetch_assoc($res)) {
        if (!in_array($row['URI'], $deadinsts, true)) {
            $deadinsts[] = $row['URI'];
        }
    }
    $res = myq($link, 'SELECT Hostname FROM Peers WHERE IsDead=1');
    while ($row = mysqli_fetch_assoc($res)) {
        if (!in_array($row['Hostname'], $deadinsts, true)) {
            $deadinsts[] = $row['Hostname'];
        }
    }
    unset($res, $row);
    gecho('Loaded list of dead instances ('.count($deadinsts).').'.N, true, false);
}

// ---- main loop ----

$insts = [];   // instances that answered with a valid peers list
$cinsts = [];  // instances that have been queried
$ainsts = [];  // every hostname seen so far
$exarr = [];   // exclusion regexes, refreshed by updexarr()
$notifs = [];  // not referenced anywhere in this file
$cloop = 0;

do {
    $peersf = @fopen($opts['peersfp'], 'w');
    if ($peersf === false) {
        mexit('Error: couldn’t open «'.$opts['peersfp'].'» in write mode.'.N, 1);
    }
    $apeersf = @fopen($opts['apeersfp'], 'w');
    if ($apeersf === false) {
        mexit('Error: couldn’t open «'.$opts['apeersfp'].'» in write mode.'.N, 1);
    }
    $cpeersf = @fopen($opts['cpeersfp'], 'w');
    if ($cpeersf === false) {
        mexit('Error: couldn’t open «'.$opts['cpeersfp'].'» in write mode.'.N, 1);
    }

    $cloop++;
    $maxround = 1;
    $newc = 0;
    $tini = time();

    // go
    crawl([$opts['startinst']], 1);

    gecho('Done crawling! :-)'.N, true, false);
    $now = time();
    gecho('Crawl started on '.date('Y-m-d H:i:s', $tini).' and ended on '.date('Y-m-d H:i:s', $now).'.'.N, true, false);
    gecho(count($ainsts).' URIs checked in '.ght($now-$tini).', '.$maxround.' rounds; '.count($insts).' responded; found '.$newc.' new instances; max. memory usage: '.ghs(memory_get_peak_usage(true)).'.'.N, true, false);
    gecho('Loop(s): '.$cloop.N, true, false);
    sleep(1);

    fclose($peersf);
    fclose($cpeersf);
    fclose($apeersf);

    // The files written incrementally during the crawl are now rewritten sorted.
    sortcheckandsave($insts, 'list of responding instances', $opts['peersfp']);
    sortcheckandsave($cinsts, 'list of checked instances', $opts['cpeersfp']);
    sortcheckandsave($ainsts, 'list of all instances', $opts['apeersfp']);

    $insts = [];
    $cinsts = [];
    $ainsts = [];
} while ($opts['loop']);

mysqli_close($link);
$link = null; // so that a late mexit() (e.g. from a signal) does not close it twice
unlink($lockfp);
exit(0);

// ---- functions ----

/**
 * Crawls one round of instances, then recurses on the peers discovered.
 *
 * For every hostname in $list it fetches the peers endpoint, appends the
 * hostname to the output files, records the check in the database and
 * collects unseen, valid, non-excluded peers for the next round.
 *
 * @param array $list hostnames to query in this round
 * @param int   $id   round number, starting from 1
 */
function crawl($list, $id)
{
    global $insts, $deadinsts, $cinsts, $ainsts, $tini, $opts, $peersf, $cpeersf, $apeersf, $maxround, $newc, $link;

    gecho('###### START OF ROUND '.$id.' ######'.N, true, false);
    $nlist = [];
    $c = count($list);
    $i = 0;
    $rtini = time();

    foreach ($list as $inst) {
        if (!in_array($inst, $ainsts, true)) {
            $ainsts[] = $inst;
            fwrite($apeersf, $inst.N);
        }
        $i++;
        $now = time();
        $rtela = $now - $rtini;
        gecho('Working on «'.$inst.'»: round '.$id.', '.$i.'/'.$c.'; TET: '.ght($now-$tini, null, 0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela, null, 0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' discovered instances; '.count($nlist).' instances in next round list; '.$newc.' new instances found.'.N, true, false);

        waituntilonline();
        updexarr();

        gecho('Trying to load «'.$inst.'»’s peers...'.N, true, false);
        $peers = gurl('https://'.$inst.'/api/v1/instance/peers', $opts['timeout'], $opts['curltimeout']);
        $cinsts[] = $inst; // don't need to check if in_array
        fwrite($cpeersf, $inst.N);

        $responded = 0;
        if ($peers['cont'] === false) {
            gecho('Error loading «'.$inst.'»’s peers: '.$peers['emsg'].'.'.N, true, true);
        } else {
            $peers = @json_decode($peers['cont'], true);
            if (!is_array($peers)) {
                gecho('Error loading «'.$inst.'»’s peers: got not good JSON.'.N, true, true);
            } else {
                gecho('Successfully loaded «'.$inst.'»’s peers :-)'.N, true, false);
                $responded = 1;
                if (!in_array($inst, $insts, true)) {
                    gecho('Instance «'.$inst.'» responded :-)'.N, true, false);
                    $insts[] = $inst;
                    fwrite($peersf, $inst.N);
                    $res = myq($link, 'SELECT ID FROM Instances WHERE URI=\''.myesc($link, $inst).'\'');
                    if (mysqli_num_rows($res) == 0) {
                        gecho('Instance «'.$inst.'» is new :-)'.N, true, false);
                        myq($link, 'INSERT INTO Instances SET URI=\''.myesc($link, $inst).'\', InsertTS='.time());
                        $newc++; // feeds the "new instances" counters in the logs
                    }
                }
                foreach ($peers as $peer) {
                    // Anything that is not a string cannot be a hostname: skip it
                    // before it gets concatenated, written out or validated.
                    if (!is_string($peer)) {
                        if ($opts['verbose']) {
                            gecho(' Not adding peer «'.json_encode($peer).'» to next round list because its name is not a string.'.N, true, true);
                        }
                        continue;
                    }
                    if (!in_array($peer, $ainsts, true)) {
                        $ainsts[] = $peer;
                        fwrite($apeersf, $peer.N);
                    }
                    $whynot = [];
                    if (in_array($peer, $cinsts, true)) {
                        $whynot[] = 'it has already been checked';
                    }
                    if (!validhostname($peer)) {
                        $whynot[] = 'its name is not a valid hostname';
                    }
                    if (ckexarr($peer)) {
                        $whynot[] = 'its name matches an exclusion regexp';
                    }
                    if (in_array($peer, $list, true)) {
                        $whynot[] = 'it is already present in current list';
                    }
                    if (in_array($peer, $nlist, true)) {
                        $whynot[] = 'it has already been added to next round list';
                    }
                    if ($opts['excludedead'] && in_array($peer, $deadinsts, true)) {
                        $whynot[] = 'it’s dead';
                    }
                    if (count($whynot) > 0) {
                        if ($opts['verbose']) {
                            gecho(' Not adding peer «'.$peer.'» to next round list because '.implode(', ', $whynot).'.'.N, true, true);
                        }
                    } else {
                        if ($opts['verbose']) {
                            gecho(' Adding peer «'.$peer.'» to next round list :-)'.N, true, false);
                        }
                        $nlist[] = $peer;
                    }
                }
            }
        }

        recordcheck($inst, $responded, $now);
    }

    if (count($nlist) > 0) {
        unset($list);
        // Updated before recursing: assigning it afterwards would let every
        // outer round overwrite the value set by the deeper ones.
        $maxround = max($maxround, $id + 1);
        crawl($nlist, $id + 1);
    } else {
        gecho('Next round list is empty.'.N, true, false);
    }
    gecho('###### END OF ROUND '.$id.' ######'.N, true, false);
}

/**
 * Stores the outcome of one check in the Peers and PeersChecks tables.
 *
 * A host already present in Peers that did not respond is flagged as dead
 * when its last successful check (or, if it never responded, its first
 * check) is older than $opts['deadline'] seconds.
 *
 * @param string $inst      hostname that was queried
 * @param int    $responded 1 if a valid peers list was received, 0 otherwise
 * @param int    $now       timestamp stored as the time of the check
 */
function recordcheck($inst, $responded, $now)
{
    global $link, $opts, $deadinsts;

    $instid = 0;
    $res = myq($link, 'SELECT * FROM Peers WHERE Hostname=\''.myesc($link, $inst).'\'');
    $nrows = mysqli_num_rows($res);
    if ($nrows > 0) {
        if ($nrows > 1) {
            gecho('«'.$inst.'» has '.$nrows.' records in Peers table! :-('.N, true, true);
        }
        $row = mysqli_fetch_assoc($res);
        $instid = $row['ID'];
        $dead = 0;
        if (!$responded) {
            // we check the last time instance responded, if ever
            $res = myq($link, 'SELECT Time FROM PeersChecks WHERE InstID='.$instid.' AND Status=1 ORDER BY Time DESC LIMIT 1', __LINE__);
            // if instance never responded we consider the time of first check
            if (mysqli_num_rows($res) == 0) {
                $res = myq($link, 'SELECT Time FROM PeersChecks WHERE InstID='.$instid.' ORDER BY Time ASC LIMIT 1', __LINE__);
            }
            if (mysqli_num_rows($res) > 0) {
                $row = mysqli_fetch_assoc($res);
                if ($now - $row['Time'] > $opts['deadline']) {
                    $dead = 1;
                    gecho('«'.$inst.'» just died!'.N, true, true);
                    $deadinsts[] = $inst;
                }
            } else {
                gecho('«'.$inst.'» exists in Peers table but there’s no data about it in PeersChecks!'.N, true, true);
            }
        }
        $query = 'UPDATE Peers SET Hostname=\''.myesc($link, $inst).'\', IsDead='.$dead.' WHERE ID='.$instid;
    } else { // not in Peers table ($nrows==0)
        $query = 'INSERT INTO Peers SET Hostname=\''.myesc($link, $inst).'\', IsDead=0';
    }
    myq($link, $query);
    if ($instid == 0) {
        $instid = mysqli_insert_id($link);
    }
    myq($link, 'INSERT INTO PeersChecks SET InstID='.$instid.', Time='.$now.', Status='.$responded);
}

/**
 * Releases the database link, the output files and the lockfile, prints
 * $msg and terminates the script.
 *
 * @param string $msg  text to print (stdout when $code is 0, stderr otherwise)
 * @param int    $code exit status
 */
function mexit($msg, $code)
{
    global $link, $peersf, $cpeersf, $apeersf, $lockfp;

    if (isset($link) && $link instanceof mysqli) {
        try {
            @mysqli_close($link);
        } catch (Throwable $e) {
            // The link was already closed: nothing left to release.
        }
    }
    // is_resource() is false for streams that were already closed; on PHP 8
    // calling fclose() on those throws instead of emitting a warning.
    foreach ([$peersf, $cpeersf, $apeersf] as $f) {
        if (is_resource($f)) {
            fclose($f);
        }
    }
    if (isset($lockfp) && is_file($lockfp)) {
        unlink($lockfp);
    }
    if ($code == 0) {
        echo($msg);
    } else {
        fwrite(STDERR, $msg);
    }
    exit((int)$code);
}

/**
 * Prints a message, optionally prefixed with a timestamp.
 *
 * @param string $msg    text to print (no newline is appended)
 * @param bool   $prtime whether to prepend microdate()
 * @param bool   $iserr  true to write to stderr, false for stdout
 */
function gecho($msg, $prtime, $iserr)
{
    if ($prtime) {
        $msg = microdate().' '.$msg;
    }
    if ($iserr) {
        fwrite(STDERR, $msg);
    } else {
        echo($msg);
    }
}

/**
 * Runs a query and terminates the script through mexit() if it fails.
 *
 * @param mysqli   $link  database connection
 * @param string   $query SQL to execute
 * @param int|null $line  optional caller line number, shown in the error message
 * @return mysqli_result|bool whatever mysqli_query() returned on success
 */
function myq(&$link, $query, $line = null)
{
    $where = is_null($line) ? '' : ' (line '.$line.')';
    try {
        $res = mysqli_query($link, $query);
    } catch (Exception $error) {
        mexit('Error: query «'.$query.'» failed'.$where.': '.$error->getMessage().' ('.$error->getCode().').'.N, 2);
    }
    // for php versions < 8, which seem to not catch mysql exceptions
    if ($res === false) {
        mexit('Error: query «'.$query.'» failed'.$where.': '.mysqli_error($link).' ('.mysqli_errno($link).').'.N, 2);
    }
    return($res);
}

/**
 * Formats a microtime(false)-style string ("fraction seconds") as
 * «Y-m-d H:i:s.fraction».
 *
 * @param string|null $time value in microtime(false) format; null means now
 * @return string
 */
function microdate($time = null)
{
    if (is_null($time)) {
        $time = microtime(false);
    }
    $time = explode(' ', $time);
    // $time[0] looks like "0.12345600": drop the leading "0."
    return(date('Y-m-d H:i:s', (int)$time[1]).'.'.substr($time[0], 2));
}

/**
 * De-duplicates and sorts $arr in place, then writes it to $fp, one value
 * per line. A warning is logged if duplicates were found.
 *
 * @param array  $arr     list to save (modified in place)
 * @param string $arrdesc description used in log messages
 * @param string $fp      path of the destination file
 */
function sortcheckandsave(&$arr, $arrdesc, &$fp)
{
    $buc = count($arr);
    $arr = array_unique($arr);
    $auc = count($arr);
    if ($buc != $auc) {
        gecho('Warning: '.$arrdesc.' contained duplicates, better check code ;-)'.N, true, true);
    }
    gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N, true, false);
    sort($arr);
    $f = @fopen($fp, 'w');
    if ($f !== false) {
        foreach ($arr as $val) {
            fwrite($f, $val.N);
        }
        fclose($f);
    } else {
        gecho('Error: couldn’t open «'.$fp.'» for writing.'.N, true, true);
    }
}

/**
 * Handler installed for SIGTERM, SIGHUP and SIGINT: cleans up and exits.
 *
 * @param int $signal number of the received signal
 */
function sighandler($signal)
{
    echo(N);
    mexit('Interrupted (signal: '.$signal.').'.N, 0);
}

/**
 * Tells whether a string is empty or made of whitespace only.
 *
 * @param string $val
 * @return bool
 */
function isempty($val)
{
    return(preg_match('/^\s*$/', $val) === 1);
}

/**
 * Blocks until a TCP connection to www.google.com:80 succeeds, retrying
 * every 10 seconds.
 */
function waituntilonline()
{
    $url = 'www.google.com';
    $gotoff = false;
    while (false === ($f = @fsockopen($url, 80, $errno, $errstr, 1))) {
        $gotoff = true;
        gecho('Warning: it seems we are offline, waiting 10 seconds before retrying...'.N, true, true);
        sleep(10);
    }
    fclose($f);
    if ($gotoff) {
        gecho('It seems we are back online! :-)'.N, true, false);
    }
}

/**
 * Reloads the exclusion regexes from $opts['excludefp'] into $exarr.
 *
 * Does nothing when no exclusion file is configured. If the file cannot be
 * opened, the previously loaded rules are kept. Lines that are not valid
 * regular expressions are reported and skipped.
 */
function updexarr()
{
    global $exarr, $opts;

    if (is_null($opts['excludefp'])) {
        return;
    }
    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        gecho('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N, true, true);
        return;
    }
    $exarr = [];
    $i = 0;
    while (($line = fgets($f)) !== false) {
        $i++;
        $line = trim($line);
        if (isempty($line)) {
            continue;
        }
        // preg_match() returns false (not 0) when the pattern does not compile.
        if (@preg_match($line, 'foo') !== false) {
            $exarr[] = $line;
        } else {
            gecho('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N, true, true);
        }
    }
    fclose($f);
}

/**
 * Tells whether a hostname matches any of the loaded exclusion regexes.
 *
 * @param string $inst hostname to test
 * @return bool
 */
function ckexarr($inst)
{
    global $exarr;

    foreach ($exarr as $re) {
        if (preg_match($re, $inst) === 1) {
            return(true);
        }
    }
    return(false);
}

/**
 * Tells whether a string contains at least one non-ASCII byte.
 *
 * @param string $s
 * @return bool
 */
function ismultibyte($s)
{
    return(preg_match('/[^\x00-\x7F]/', $s) === 1);
}

/**
 * Checks that a hostname is at most 253 bytes long and made of
 * dot-separated labels of 1-63 characters from [a-zA-Z0-9-] that neither
 * start nor end with a hyphen. Non-ASCII names are converted with
 * idn_to_ascii() before being checked.
 *
 * @param mixed $hostname candidate hostname; non-strings are rejected
 * @return bool
 */
function validhostname($hostname)
{
    if (!is_string($hostname)) {
        return(false);
    }
    if (ismultibyte($hostname)) {
        $hostname = idn_to_ascii($hostname, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
        if ($hostname === false) {
            return(false); // the name could not be converted
        }
    }
    if (strlen($hostname) > 253) {
        return(false);
    }
    foreach (explode('.', $hostname) as $label) {
        $len = strlen($label);
        if ($len < 1 || $len > 63) {
            return(false);
        }
        if ($label[0] === '-' || $label[$len - 1] === '-') {
            return(false);
        }
        // \z instead of $: "$" would also accept a label ending with a newline.
        if (preg_match('#^[a-zA-Z0-9-]+\z#', $label) !== 1) {
            return(false);
        }
    }
    return(true);
}