#!/usr/bin/php
<?php
// NOTE(review): the license header that precedes the code is truncated in the
// reviewed copy of this file (only its closing characters survive). Restore it
// from the project's canonical copy before committing.

// Fediverse peers crawler.
//
// Starting from one instance, loads [instance]/api/v1/instance/peers, records
// which hosts responded, and repeats round after round on every newly
// discovered peer until no new peer is found. Results are appended to three
// list files while crawling and stored in the Peers / PeersChecks DB tables.

define('N',"\n");
define('SNAME',basename(__FILE__));
define('BNAME',preg_replace('/\.[^.]*$/','',SNAME));

require(__DIR__.'/../site/mustard/include/gurl.php');
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');

use function mysqli_real_escape_string as myesc;

$opts=[
    'inifp'=>__DIR__.'/../conf/mustard.ini',
    'startinst'=>'mastodon.social',
    // seconds without a successful check after which a peer is marked dead
    'deadline'=>62*24*60*60,
    'peersfp'=>__DIR__.'/peers',
    'apeersfp'=>__DIR__.'/peers.all',
    'cpeersfp'=>__DIR__.'/peers.checked',
    'restore'=>false,
    'excludefp'=>null,
    'timeout'=>5,
    'curltimeout'=>10,
    'verbose'=>false,
    'excludedead'=>false,
    'ignorelock'=>false
];

// NOTE(review): the wording below is reproduced exactly as found; only the
// line structure was restored. Option value placeholders and the URL in the
// final sentence ("see for details") look like they were lost -- confirm
// against the canonical copy.
$help='SYNOPSIS
  '.SNAME.' [options]

DESCRIPTION
  This program tries to build a fairly complete list of fediverse instances
  exposing the [instance]/api/v1/instance/peers endpoint.

OPTIONS
  -s, --startinst
    Defines the first instance to crawl.
    DEFAULT: «'.$opts['startinst'].'»
  -p, --peersfp
    Defines the file into which the ordered list of responding instances will
    be saved.
    DEFAULT: «'.$opts['peersfp'].'»
  -a, --apeersfp
    Defines the file into which the ordered list of all instances will be
    saved.
    DEFAULT: «'.$opts['apeersfp'].'»
  -c, --cpeersfp
    Defines the file into which the ordered list of all checked instances will
    be saved.
    DEFAULT: «'.$opts['cpeersfp'].'»
  -I, --ignorelock
    Normally, if its lockfile exists, the program exits with an error before
    doing anything. With this option the lockfile is ignored. Please verify
    that the program is not already running before using it.
  -r, --restore
    If peers file already exists on program’s start it will be loaded into
    memory and each instance it contains will be considered “already crawled”,
    thus allowing to “restore an interrupted crawling session”.
  -e, --excludefp
    Defines a file containing exclusion rules: one regular expression per line
    (empty lines are ignored). Any instance matching any defined regex will be
    ignored by the program.
    Changes made to this file during program execution will be taken into
    account.
  -t, --timeout
    Defines the timeout in seconds for every connection attempt.
    DEFAULT: «'.$opts['timeout'].'»
  -E, --excludedead
    Exclude instances marked as "Dead" in the database.
  -v, --verbose
    Be more verbose.
  -h, --help
    Show this help text and exit.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see for details.'.N;

// Returns the value following the option at $argv[$i] and advances $i past it;
// exits with an error when the option is the last argument.
$optvalue=function ($what) use ($argv,$argc,&$i) {
    $opt=$argv[$i];
    if ($i+1>=$argc)
        mexit('Error: option «'.$opt.'» has to be followed by '.$what.' (use «-h» for more info).'.N,1);
    $i++;
    return($argv[$i]);
};

for ($i=1; $i<$argc; $i++) {
    $arg=$argv[$i];
    switch ($arg) {
        case '-s':
        case '--startinst':
            $opts['startinst']=$optvalue('a domain name');
            break;
        case '-p':
        case '--peersfp':
            $opts['peersfp']=$optvalue('a file’s path');
            break;
        case '-a':
        case '--apeersfp':
            $opts['apeersfp']=$optvalue('a file’s path');
            break;
        case '-c':
        case '--cpeersfp':
            $opts['cpeersfp']=$optvalue('a file’s path');
            break;
        case '-r':
        case '--restore':
            $opts['restore']=true;
            break;
        case '-I':
        case '--ignorelock':
            $opts['ignorelock']=true;
            break;
        case '-e':
        case '--excludefp':
            $opts['excludefp']=$optvalue('a file’s path');
            break;
        case '-t':
        case '--timeout':
            $val=$optvalue('a number of seconds');
            if (preg_match('/^[0-9]+$/',$val)!==1)
                mexit('Error: option «'.$arg.'» has to be followed by a number of seconds (use «-h» for more info).'.N,1);
            $opts['timeout']=(int)$val;
            break;
        case '-E':
        case '--excludedead':
            $opts['excludedead']=true;
            break;
        case '-v':
        case '--verbose':
            $opts['verbose']=true;
            break;
        case '-h':
        case '--help':
            mexit($help,0);
        default:
            mexit('Error: don’t know how to interpret «'.$arg.'» (use «-h» to read the help text).'.N,1);
    }
}

$lockfp=__DIR__.'/'.BNAME.'.lock';
if (is_file($lockfp) && !$opts['ignorelock']) {
    // Deliberately not mexit(): it would delete the lockfile, which here
    // belongs to the other (possibly still running) process.
    gecho('Error: lockfile exists: it seems the program is already running; if you’re sure it’s not, you can use «-I» to force execution.'.N,false,true);
    exit(1);
}
// The exit code used to be «false», which made this failure exit with status 0.
if (@touch($lockfp)===false)
    mexit('Error: could not create lockfile «'.$lockfp.'».'.N,1);

// Asynchronous signal delivery: no declare(ticks=1) needed.
pcntl_async_signals(true);
pcntl_signal(SIGTERM,'signalHandler');// Termination ('kill' was called)
pcntl_signal(SIGHUP,'signalHandler');// Terminal log-out
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)

$iniarr=@parse_ini_file($opts['inifp'])
    or mexit('Error: couldn’t open «'.$opts['inifp'].'».'.N,1);

try {
    $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']);
} catch (Exception $error) {
    mexit('Error: couldn’t connect to MySQL server: '.mysqli_connect_error().'.'.N,1);
}
// for php versions < 8
if ($link===false)
    mexit('Error: couldn’t connect to MySQL server: '.mysqli_connect_error().'.'.N,1);

try {
    $res=mysqli_set_charset($link,'utf8mb4');
} catch (Exception $error) {
    mexit('Error: couldn’t set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,1);
}
// for php versions < 8
if ($res===false)
    mexit('Error: couldn’t set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,1);

$deadinsts=[];
if ($opts['excludedead']) {
    $res=myq($link,'SELECT URI FROM Instances WHERE Dead=1');
    while ($row=mysqli_fetch_assoc($res))
        if (!in_array($row['URI'],$deadinsts,true))
            $deadinsts[]=$row['URI'];
    $res=myq($link,'SELECT Hostname FROM Peers WHERE IsDead=1');
    while ($row=mysqli_fetch_assoc($res))
        // this query only returns «Hostname»: there is no «URI» column here
        if (!in_array($row['Hostname'],$deadinsts,true))
            $deadinsts[]=$row['Hostname'];
    unset($res,$row);
    gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
}

$insts=[];   // instances that answered with a valid peers list
$ainsts=[];  // every hostname seen so far
$cinsts=[];  // every hostname we tried to load
$exarr=[];   // exclusion regexps, reloaded by updexarr()
$mode=['mode'=>'w','desc'=>'write'];
if ($opts['restore']) {
    $insts=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
    if ($insts===false)
        mexit('Error: couldn’t open «'.$opts['peersfp'].'» for reading.'.N,1);
    $ainsts=@file($opts['apeersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
    if ($ainsts===false)
        mexit('Error: couldn’t open «'.$opts['apeersfp'].'» for reading.'.N,1);
    $cinsts=@file($opts['cpeersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
    if ($cinsts===false)
        mexit('Error: couldn’t open «'.$opts['cpeersfp'].'» for reading.'.N,1);
    $mode=['mode'=>'a','desc'=>'append'];
    gecho('Succesfully restored previous session :-)'.N,true,false);
}

$peersf=@fopen($opts['peersfp'],$mode['mode']);
if ($peersf===false)
    mexit('Error: couldn’t open «'.$opts['peersfp'].'» in '.$mode['desc'].' mode.'.N,1);
$apeersf=@fopen($opts['apeersfp'],$mode['mode']);
if ($apeersf===false)
    mexit('Error: couldn’t open «'.$opts['apeersfp'].'» in '.$mode['desc'].' mode.'.N,1);
$cpeersf=@fopen($opts['cpeersfp'],$mode['mode']);
if ($cpeersf===false)
    mexit('Error: couldn’t open «'.$opts['cpeersfp'].'» in '.$mode['desc'].' mode.'.N,1);

$maxround=1;
$tini=time();
crawl([$opts['startinst']],1);
gecho('Done crawling! :-)'.N,true,false);

mysqli_close($link);
shutdown(true);

$now=time();
gecho('Crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'.'.N,true,false);
gecho(count($ainsts).' URIs checked in '.ght($now-$tini).' ('.$maxround.' rounds); '.count($insts).' responded. Max. memory usage: '.ghs(memory_get_peak_usage(true)).'.'.N,true,false);

exit(0);

// functions

/**
 * Crawls one round: loads the peers of every instance in $list, records the
 * outcome, then recurses on the list of newly discovered peers.
 *
 * @param array $list hostnames to check in this round
 * @param int   $id   1-based round number
 */
function crawl($list,$id) {
    global $insts, $cinsts, $ainsts, $tini, $opts, $peersf, $cpeersf, $apeersf, $maxround;

    // Track the deepest round on the way down. Assigning after the recursive
    // call returned (as before) left $maxround at 2 once recursion unwound.
    $maxround=max($maxround,$id);

    gecho('###### START OF ROUND '.$id.' ######'.N,true,false);
    $nlist=[];
    $c=count($list);
    $i=0;
    $rtini=time();
    foreach ($list as $inst) {
        if (!in_array($inst,$ainsts,true)) {
            $ainsts[]=$inst;
            fwrite($apeersf,$inst.N);
        }
        $i++;
        $now=time();
        $rtela=$now-$rtini;
        gecho('Working on «'.$inst.'»: round '.$id.', '.$i.'/'.$c.'; TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' discovered instances; '.count($nlist).' instances in next round list.'.N,true,false);
        waituntilonline();
        updexarr();
        gecho('Trying to load «'.$inst.'»’s peers...'.N,true,false);
        $peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['timeout'],$opts['curltimeout']);
        $cinsts[]=$inst;// no need to check in_array: checked instances are never queued again
        fwrite($cpeersf,$inst.N);
        $responded=0;
        if ($peers['cont']===false) {
            gecho('Error loading «'.$inst.'»’s peers: '.$peers['emsg'].'.'.N,true,true);
        } else {
            $peers=@json_decode($peers['cont'],true);
            if (!is_array($peers)) {
                gecho('Error loading «'.$inst.'»’s peers: got not good JSON.'.N,true,true);
            } else {
                gecho('Successfully loaded «'.$inst.'»’s peers :-)'.N,true,false);
                $responded=1;
                if (!in_array($inst,$insts,true)) {
                    gecho('Discovered instance «'.$inst.'» :-)'.N,true,false);
                    $insts[]=$inst;
                    fwrite($peersf,$inst.N);
                }
                foreach ($peers as $peer) {
                    // The list comes from a remote server: anything that is
                    // not a string must be dropped before it is concatenated,
                    // written to disk or handed to the hostname validator.
                    if (!is_string($peer)) {
                        if ($opts['verbose'])
                            gecho(' Not adding peer «'.json_encode($peer).'» to next round list because its name is not a string.'.N,true,true);
                        continue;
                    }
                    if (!in_array($peer,$ainsts,true)) {
                        $ainsts[]=$peer;
                        fwrite($apeersf,$peer.N);
                    }
                    $whynot=peerrejections($peer,$list,$nlist);
                    if (count($whynot)>0) {
                        if ($opts['verbose'])
                            gecho(' Not adding peer «'.$peer.'» to next round list because '.implode(', ',$whynot).'.'.N,true,true);
                    } else {
                        if ($opts['verbose'])
                            gecho(' Adding peer «'.$peer.'» to next round list :-)'.N,true,false);
                        $nlist[]=$peer;
                    }
                }
            }
        }
        recordcheck($inst,$responded,$now);
    }
    if (count($nlist)>0) {
        unset($list);
        crawl($nlist,$id+1);
    } else {
        gecho('Next round list is empty.'.N,true,false);
    }
    gecho('###### END OF ROUND '.$id.' ######'.N,true,false);
}

/**
 * Lists the reasons why $peer must not be queued for the next round.
 *
 * @param string $peer  candidate hostname
 * @param array  $list  hostnames of the current round
 * @param array  $nlist hostnames already queued for the next round
 * @return array human-readable reasons; empty when the peer can be queued
 */
function peerrejections($peer,$list,$nlist) {
    global $cinsts, $deadinsts, $opts;
    $whynot=[];
    if (in_array($peer,$cinsts,true))
        $whynot[]='it has already been checked';
    if (!validhostname($peer))
        $whynot[]='its name is not a valid hostname';
    if (ckexarr($peer))
        $whynot[]='its name matches an exclusion regexp';
    if (in_array($peer,$list,true))
        $whynot[]='it is already present in current list';
    if (in_array($peer,$nlist,true))
        $whynot[]='it has already been added to next round list';
    if ($opts['excludedead'] && in_array($peer,$deadinsts,true))
        $whynot[]='it’s dead';
    return($whynot);
}

/**
 * Stores the outcome of one check in the Peers and PeersChecks tables and
 * flags the peer as dead when it has not responded for longer than
 * $opts['deadline'] seconds.
 *
 * @param string $inst      hostname that was checked
 * @param int    $responded 1 if a valid peers list was loaded, 0 otherwise
 * @param int    $now       unix time of the check
 */
function recordcheck($inst,$responded,$now) {
    global $link, $opts;
    $esc=myesc($link,$inst);
    $instid=0;
    $res=myq($link,'SELECT * FROM Peers WHERE Hostname=\''.$esc.'\'');
    if (mysqli_num_rows($res)>0) {
        $row=mysqli_fetch_assoc($res);
        $instid=$row['ID'];
        $dead=0;
        if (!$responded) {
            // we check the last time instance responded, if ever
            $res=myq($link,'SELECT Time FROM PeersChecks WHERE InstID='.$instid.' AND Status=1 ORDER BY Time DESC LIMIT 1',__LINE__);
            // if instance never responded we consider the time of first check
            if (mysqli_num_rows($res)==0)
                $res=myq($link,'SELECT Time FROM PeersChecks WHERE InstID='.$instid.' ORDER BY Time ASC LIMIT 1',__LINE__);
            if (mysqli_num_rows($res)>0) {
                $row=mysqli_fetch_assoc($res);
                if ($now-$row['Time']>$opts['deadline']) {
                    $dead=1;
                    gecho('«'.$inst.'» just died!'.N,true,true);
                }
            } else {
                gecho('«'.$inst.'» exists in Peers table but there’s no data about it in PeersChecks!'.N,true,true);
            }
        }
        $query='UPDATE Peers SET Hostname=\''.$esc.'\', IsDead='.$dead.' WHERE ID='.$instid;
    } else {
        $query='INSERT INTO Peers SET Hostname=\''.$esc.'\', IsDead=0';
    }
    myq($link,$query);
    if ($instid==0)
        $instid=mysqli_insert_id($link);
    myq($link,'INSERT INTO PeersChecks SET InstID='.$instid.', Time='.$now.', Status='.$responded);
}

/**
 * Closes the DB link (if open), removes the lockfile, prints $msg and exits.
 *
 * @param string $msg  message; sent to stdout when $code is 0, else to stderr
 * @param int    $code process exit status
 */
function mexit($msg,$code) {
    global $link, $lockfp;
    if (isset($link) && $link!==false)
        mysqli_close($link);
    if (isset($lockfp) && file_exists($lockfp))
        unlink($lockfp);
    if ($code==0)
        echo($msg);
    else
        fwrite(STDERR,$msg);
    exit($code);
}

/**
 * Prints a message, optionally prefixed with a microsecond timestamp.
 *
 * @param string $msg    text to print (no newline is added)
 * @param bool   $prtime prefix the message with microdate()
 * @param bool   $iserr  write to stderr instead of stdout
 */
function gecho($msg,$prtime,$iserr) {
    if ($prtime)
        $msg=microdate().' '.$msg;
    if ($iserr)
        fwrite(STDERR,$msg);
    else
        echo($msg);
}

/**
 * Runs a query and terminates the program (exit status 2) when it fails.
 *
 * @param mysqli   $link  open connection
 * @param string   $query SQL to run
 * @param int|null $line  optional caller line number, added to the error text
 * @return mysqli_result|bool result of mysqli_query()
 */
function myq(&$link,$query,$line=null) {
    $where=(is_null($line) ? '' : ' (line '.$line.')');
    try {
        $res=mysqli_query($link,$query);
    } catch (Exception $error) {
        mexit('Error: query «'.$query.'»'.$where.' failed: '.$error->getMessage().' ('.$error->getCode().').'.N,2);
    }
    // for php versions < 8, which seem to not catch mysql exceptions
    if ($res===false)
        mexit('Error: query «'.$query.'»'.$where.' failed: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,2);
    return($res);
}

/**
 * Formats a microtime(false)-style "usec sec" string as
 * "Y-m-d H:i:s.<fraction>"; uses the current time when $time is null.
 */
function microdate($time=null) {
    if (is_null($time))
        $time=microtime(false);
    $time=explode(' ',$time);
    // $time[0] looks like "0.12345600": drop the leading "0."
    return(date('Y-m-d H:i:s',$time[1]).'.'.substr($time[0],2));
}

/**
 * De-duplicates and sorts $arr in place, then rewrites the file $fp with one
 * entry per line. Warns when duplicates were found.
 *
 * @param array  $arr     list to save (modified in place)
 * @param string $arrdesc description used in log messages
 * @param string $fp      path of the destination file
 */
function sortcheckandsave(&$arr,$arrdesc,&$fp) {
    $buc=count($arr);
    $arr=array_unique($arr);
    $auc=count($arr);
    if ($buc!=$auc)
        gecho('Warning: '.$arrdesc.' contained duplicates, better check code ;-)'.N,true,true);
    gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N,true,false);
    sort($arr);
    $f=@fopen($fp,'w');
    if ($f!==false) {
        foreach ($arr as $val)
            fwrite($f,$val.N);
        fclose($f);
    } else {
        gecho('Error: couldn’t open «'.$fp.'» for writing.'.N,true,true);
    }
}

/**
 * Closes the three list files and removes the lockfile; when $dosort is true
 * also rewrites the three files sorted and de-duplicated.
 */
function shutdown($dosort) {
    global $opts, $peersf, $apeersf, $cpeersf, $insts, $ainsts, $cinsts, $lockfp;
    if (isset($peersf) && $peersf!==false)
        @fclose($peersf);
    if (isset($apeersf) && $apeersf!==false)
        @fclose($apeersf);
    if (isset($cpeersf) && $cpeersf!==false)
        @fclose($cpeersf);
    if (isset($lockfp) && file_exists($lockfp))
        unlink($lockfp);
    if ($dosort) {
        sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
        sortcheckandsave($cinsts,'list of checked instances',$opts['cpeersfp']);
        sortcheckandsave($ainsts,'list of all instances',$opts['apeersfp']);
    }
}

/**
 * SIGTERM/SIGHUP/SIGINT handler: closes files, removes the lockfile and exits
 * with status 3 without re-sorting the list files.
 */
function signalHandler($signal) {
    echo(N.'Interrupted (signal: '.$signal.').'.N);
    shutdown(false);
    exit(3);
}

/** Tells whether $val is empty or made of whitespace only. */
function isempty($val) {
    return(preg_match('/^\s*$/',$val)===1);
}

/**
 * Blocks until a TCP connection to www.google.com:80 succeeds, retrying every
 * 30 seconds.
 */
function waituntilonline() {
    $url='www.google.com';
    $gotoff=false;
    while (false===($f=@fsockopen($url,80,$errno,$errstr,1))) {
        $gotoff=true;
        gecho('Warning: it seems we are offline, waiting 30 seconds before retrying :-('.N,true,true);
        sleep(30);
    }
    fclose($f);
    if ($gotoff)
        gecho('It seems we are back online! :-)'.N,true,false);
}

/**
 * Reloads the exclusion regexps from $opts['excludefp'] into $exarr, skipping
 * empty lines and warning about invalid patterns. When the file cannot be
 * opened the previously loaded rules are kept.
 */
function updexarr() {
    global $exarr, $opts;
    if (is_null($opts['excludefp']))
        return;
    $f=@fopen($opts['excludefp'],'r');
    if ($f===false) {
        gecho('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N,true,true);
        return;
    }
    $i=0;
    $exarr=[];
    while (($raw=fgets($f))!==false) {
        $i++;
        $line=trim($raw);
        if (isempty($line))
            continue;
        // preg_match() returns false (with a suppressed warning) on a broken pattern
        if (@preg_match($line,'foo')!==false)
            $exarr[]=$line;
        else
            gecho('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N,true,true);
    }
    // this runs once per crawled instance: do not leave the handle open
    fclose($f);
}

/** Tells whether $inst matches any of the loaded exclusion regexps. */
function ckexarr($inst) {
    global $exarr;
    foreach ($exarr as $re)
        if (preg_match($re,$inst)===1)
            return(true);
    return(false);
}

/**
 * Tells whether $s has more bytes than the number of characters matched by
 * «.» in UTF-8 mode (true for multibyte text, and also for invalid UTF-8 or
 * strings containing newlines, which «.» does not match).
 */
function ismultibyte($s) {
    preg_replace('/./u','.',$s,-1,$c);
    return(strlen($s)>$c);
}

/**
 * Validates a hostname: at most 253 bytes, dot-separated labels of 1-63
 * characters made of letters, digits and «-», not starting or ending with
 * «-». Names with multibyte characters are converted with idn_to_ascii()
 * first. Labels containing «--» are accepted (needed for «xn--» labels).
 * A trailing path or port is not stripped, so such names are rejected.
 *
 * @param mixed $hostname candidate name
 * @return bool
 */
function validhostname($hostname) {
    if (!is_string($hostname))
        return(false);
    if (ismultibyte($hostname)) {
        $hostname=idn_to_ascii($hostname,IDNA_DEFAULT,INTL_IDNA_VARIANT_UTS46);
        if ($hostname===false)
            return(false);
    }
    if (strlen($hostname)>253)
        return(false);
    foreach (explode('.',$hostname) as $label) {
        $len=strlen($label);
        if ($len<1 || $len>63)
            return(false);
        if ($label[0]==='-' || $label[$len-1]==='-')
            return(false);
        // «D»: «$» must match at the very end, not before a trailing newline
        if (preg_match('#^[a-zA-Z0-9-]+$#D',$label)!==1)
            return(false);
    }
    return(true);
}