#!/usr/bin/php
<?php
/*
 * peerscrawl.php - crawls the fediverse starting from one instance and
 * following each instance's /api/v1/instance/peers list, round after round.
 *
 * NOTE(review): the original header comment of this file appears to have been
 * lost (only its closing delimiter survived); restore the license notice from
 * the project's canonical copy.
 */

const N="\n";

require(__DIR__.'/../site/mustard/include/gurl.php');

setlocale(LC_ALL,getenv('LANG'));

// Default values for every command line option.
$opts=[
	'inifp'=>__DIR__.'/../conf/mustard.ini',
	'startinst'=>'mastodon.social',
	'peersfp'=>__DIR__.'/peers',
	'allpeersfp'=>__DIR__.'/peers.all',
	'restore'=>false,
	'excludefp'=>null,
	'timeout'=>5,
	'verbose'=>false,
	'excludedead'=>false,
	'timezone'=>date_default_timezone_get(),
	'ignorelock'=>false
];

// NOTE(review): the last sentence of the help text ("see for details") seems
// to be missing its target (probably a URL) - confirm against the upstream copy.
$help='peerscrawl.php

DESCRIPTION
 This program tries to build a fairly complete list of fediverse instances exposing the [instance]/api/v1/instance/peers endpoint.

SYNOPSIS
 peerscrawl.php [options]

OPTIONS
 -s, --startinst
  Defines the first instance to crawl.
  DEFAULT: «'.$opts['startinst'].'»
 -p, --peersfp
  Defines the file into which the ordered list of responding instances will be saved.
  DEFAULT: «'.$opts['peersfp'].'»
 -a, --allpeersfp
  Defines the file into which the ordered list of all checked instances will be saved.
  DEFAULT: «'.$opts['allpeersfp'].'»
 -I, --ignorelock
  Normally, if its lockfile exists, the program exits with an error before doing anything. With this option the lockfile is ignored. Please verify that the program is not already running before using it.
 -r, --restore
  If peers file already exists on program’s start it will be loaded into memory and each instance it contains will be considered “already crawled”, thus allowing to “restore an interrupted crawling session”.
 -e, --excludefp
  Defines a file containing exclusion rules: one regular expression per line (empty lines are ignored). Any instance matching any defined regex will be ignored by the program. Changes made to this file during program execution will be taken into account.
 -t, --timeout
  Defines the timeout in seconds for every connection attempt.
  DEFAULT: «'.$opts['timeout'].'»
 -T, --timezone
  Defines the timezone for displaying localized values for dates and times.
  DEFAULT on this system: «'.$opts['timezone'].'»
  Note: if you want localized format as well set LANG environment variable.
 -L, --tzlist
  List all valid timezones and exit.
 -E, --excludedead
  Exclude instances marked as "Dead" in the database.
 -v, --verbose
  Be more verbose.
 -h, --help
  Show this help text and exit.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain conditions; see for details.'.N;

// Command line parsing. Arguments not starting with «-» are silently ignored.
for ($i=1; $i<$argc; $i++) {
	if (substr($argv[$i],0,1)!='-') continue;
	switch ($argv[$i]) {
		case '-s':
		case '--startinst':
			if ($i+1>=$argc) mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N,1);
			$i++;
			$opts['startinst']=$argv[$i];
			break;
		case '-p':
		case '--peersfp':
			if ($i+1>=$argc) mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N,1);
			$i++;
			$opts['peersfp']=$argv[$i];
			break;
		case '-a':
		case '--allpeersfp':
			if ($i+1>=$argc) mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N,1);
			$i++;
			$opts['allpeersfp']=$argv[$i];
			break;
		case '-r':
		case '--restore':
			$opts['restore']=true;
			break;
		case '-I':
		case '--ignorelock':
			$opts['ignorelock']=true;
			break;
		case '-e':
		case '--excludefp':
			if ($i+1>=$argc || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1])) mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable file’s path (use «-h» for more info).'.N,1);
			$i++;
			$opts['excludefp']=$argv[$i];
			break;
		case '-t':
		case '--timeout':
			if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1) mexit('Option «'.$argv[$i].'» has to be followed by a number of seconds (use «-h» for more info).'.N,1);
			$i++;
			$opts['timeout']=$argv[$i]+0;
			break;
		case '-T':
		case '--timezone':
			// date_default_timezone_set() both validates and applies the timezone.
			if ($i+1>=$argc || !@date_default_timezone_set($argv[$i+1])) mexit('Option «'.$argv[$i].'» has to be followed by a valid timezone identifier (use «-h» for more info).'.N,1);
			$i++;
			$opts['timezone']=$argv[$i];
			break;
		case '-L':
		case '--tzlist':
			foreach (timezone_identifiers_list() as $val) gecho($val.N,false,false);
			exit(0);
		case '-E':
		case '--excludedead':
			$opts['excludedead']=true;
			break;
		case '-v':
		case '--verbose':
			$opts['verbose']=true;
			break;
		case '-h':
		case '--help':
			mexit($help,0);
			break;
		default:
			mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N,1);
			break;
	}
}

// Lock file handling. $lockfp is deliberately defined only here, after option
// parsing: mexit() removes the lock file only when $lockfp is set, so an early
// exit caused by a bad option never deletes the lock of a running instance.
$lockfp=__DIR__.'/peerscrawl.lock';
if (is_file($lockfp) && !$opts['ignorelock']) {
	echo('Lockfile exists: it seems the program is already running; if you’re sure it’s not true, use «-I» to force execution.'.N);
	exit(2);
}
touch($lockfp);

/**
 * Prints a message, releases the database connection and the lock file (when
 * they exist) and terminates the program.
 *
 * @param string $msg  Message to print: to stdout when $code is 0, to stderr otherwise.
 * @param int    $code Process exit code.
 */
function mexit($msg,$code) {
	global $link, $lockfp;
	if (isset($link) && $link!==false) mysqli_close($link);
	if (isset($lockfp) && file_exists($lockfp)) unlink($lockfp);
	if ($code==0) echo($msg);
	else fwrite(STDERR,$msg);
	exit($code);
}

/**
 * Prints a message, optionally prefixed by a timestamp.
 *
 * @param string $msg    Message to print.
 * @param bool   $prtime Whether to prefix the message with microdate().
 * @param bool   $iserr  Whether to write to stderr instead of stdout.
 */
function gecho($msg,$prtime,$iserr) {
	if ($prtime) $msg=microdate().' '.$msg;
	if ($iserr) fwrite(STDERR,$msg);
	else echo($msg);
}

/**
 * Formats a microtime(false)-style string ("0.NNNNNNNN SSSSSSSSSS") as
 * "Y-m-d H:i:s.NNNNNNNN".
 *
 * @param string|null $time A value in microtime(false) format; null means "now".
 * @return string
 */
function microdate($time=null) {
	if (is_null($time)) $time=microtime(false);
	$time=explode(' ',$time);
	// $time[0] is "0.NNNNNNNN": skip the leading "0." to keep only the fraction.
	return(date('Y-m-d H:i:s',$time[1]).'.'.substr($time[0],2));
}

/**
 * Removes duplicates from a list (warning when any is found), sorts it and
 * writes it to a file, one entry per line. The list is modified in place.
 *
 * @param array  $arr     List to deduplicate, sort and save.
 * @param string $arrdesc Human readable description of the list, used in messages.
 * @param string $fp      Path of the destination file.
 */
function sortcheckandsave(&$arr,$arrdesc,&$fp) {
	$buc=count($arr);
	$arr=array_unique($arr);
	$auc=count($arr);
	if ($buc!=$auc) gecho('WARNING: '.$arrdesc.' contained '.($buc-$auc).' duplicates, better check my code ;-)'.N,true,true);
	gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N,true,false);
	sort($arr);
	$f=@fopen($fp,'w');
	if ($f!==false) {
		foreach ($arr as $val) fwrite($f,$val.N);
		fclose($f);
	} else {
		gecho('ERROR: couldn’t open «'.$fp.'» for writing.'.N,true,true);
	}
}

/**
 * Closes the output files, removes the lock file and, when requested, rewrites
 * both output files as sorted, deduplicated lists.
 *
 * @param bool $dosort Whether to sort and save the two instance lists.
 */
function shutdown($dosort) {
	global $opts, $peersf, $allpeersf, $insts, $ainsts, $lockfp;
	if (isset($peersf) && $peersf!==false) @fclose($peersf);
	if (isset($allpeersf) && $allpeersf!==false) @fclose($allpeersf);
	if (isset($lockfp) && file_exists($lockfp)) unlink($lockfp);
	if ($dosort) {
		sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
		sortcheckandsave($ainsts,'list of all checked instances',$opts['allpeersfp']);
	}
}

/**
 * Signal handler: closes files, removes the lock file and exits with code 3.
 *
 * WARNING: if the script is piped, e.g. "script.php | tee script.log", this
 * function still runs even though its output is not shown.
 *
 * @param int $signal Number of the received signal.
 */
function signalHandler($signal) {
	echo(N.'I got interrupted (signal: '.$signal.').'.N);
	shutdown(false);
	exit(3);
}

pcntl_async_signals(true);
pcntl_signal(SIGTERM,'signalHandler'); // Termination ('kill' was called)
pcntl_signal(SIGHUP,'signalHandler'); // Terminal log-out
pcntl_signal(SIGINT,'signalHandler'); // Interrupted (Ctrl-C is pressed)

// Optionally load the list of instances marked as dead in the database.
$deadinsts=[];
if ($opts['excludedead']) {
	// Since PHP 8.1 mysqli throws exceptions by default, which would bypass the
	// «or mexit(...)» handlers below and leave the lock file behind.
	mysqli_report(MYSQLI_REPORT_OFF);
	$iniarr=@parse_ini_file($opts['inifp'])
		or mexit('ERROR: I couldn’t open «'.$opts['inifp'].'».'.N,2);
	$link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket'])
		or mexit('ERROR: I couldn’t connect to MySQL server: '.mysqli_connect_error().N,2);
	mysqli_set_charset($link,'utf8mb4')
		or mexit('ERROR trying to set MySQL client charset: '.__LINE__.': '.mysqli_error($link).N,2);
	$res=mysqli_query($link,'SELECT URI FROM Instances WHERE Dead=1')
		or mexit('ERROR: '.__LINE__.': '.mysqli_error($link).N,2);
	// Read the result while the connection is still open, then release both.
	while ($row=mysqli_fetch_assoc($res)) $deadinsts[]=$row['URI'];
	mysqli_free_result($res);
	unset($res);
	mysqli_close($link);
	// Forget the closed handle, so that a later mexit() won't close it twice.
	$link=null;
	gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
}

$insts=[];  // instances that answered with a valid peers list
$ainsts=[]; // every instance that got checked
$exarr=[];  // exclusion regexes, reloaded at every round by updexarr()

if ($opts['restore']) {
	if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
		gecho('Loading «'.$opts['peersfp'].'».'.N,true,false);
		$insts=file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
		if ($insts===false) mexit('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N,2);
	} else {
		mexit('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N,2);
	}
}

$peersf=@fopen($opts['peersfp'],'w');
if (!$peersf) mexit('I could not open «'.$opts['peersfp'].'» in write mode.'.N,2);
// Opening in «w» mode just truncated the file: write the restored entries back
// right away, otherwise they would be lost if this session gets interrupted.
foreach ($insts as $inst) fwrite($peersf,$inst.N);

$allpeersf=@fopen($opts['allpeersfp'],'w');
if (!$allpeersf) mexit('I could not open «'.$opts['allpeersfp'].'» in write mode.'.N,2);

/**
 * Tells whether a string is empty or made of whitespace only.
 *
 * @param string $val
 * @return bool
 */
function isempty($val) {
	return(preg_match('/^\s*$/',$val)===1);
}

/**
 * Blocks until a TCP connection to www.google.com:80 succeeds, retrying every
 * 5 seconds and printing a warning after each failed attempt.
 */
function waituntilonline() {
	$url='www.google.com';
	while (false===($f=@fsockopen($url,80,$errno,$errstr,1))) {
		gecho('WARNING: it seems we are offline :-('.N,true,true);
		sleep(5);
	}
	fclose($f);
}

/**
 * Reloads the exclusion regexes from the file given with «--excludefp» into the
 * global $exarr. Blank lines are skipped; lines that are not valid regular
 * expressions are reported and skipped. When the file can't be opened the
 * previously loaded rules are kept.
 */
function updexarr() {
	global $exarr, $opts;
	if (is_null($opts['excludefp'])) return;
	$f=@fopen($opts['excludefp'],'r');
	if ($f===false) {
		gecho('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N,true,true);
		return;
	}
	$exarr=[];
	$i=0;
	while (($line=fgets($f))!==false) {
		$i++;
		$line=trim($line);
		if (isempty($line)) continue;
		// preg_match() returns false when the pattern does not compile.
		if (@preg_match($line,'foo')!==false) $exarr[]=$line;
		else gecho('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N,true,true);
	}
	// This function runs at every round: don't leak a file handle each time.
	fclose($f);
}

/**
 * Tells whether an instance name matches any of the loaded exclusion regexes.
 *
 * @param string $inst
 * @return bool
 */
function ckexarr($inst) {
	global $exarr;
	foreach ($exarr as $re)
		if (preg_match($re,$inst)===1) return(true);
	return(false);
}

require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');

/**
 * Tells whether a string is longer in bytes than the number of characters
 * matched by «/./u» (which is what happens with multibyte UTF-8 characters).
 *
 * @param string $s
 * @return bool
 */
function ismultibyte($s) {
	preg_replace('/./u','.',$s,-1,$c);
	return(strlen($s)>$c);
}

/**
 * Tells whether a string is a syntactically valid hostname: at most 253 bytes,
 * dot-separated labels of 1 to 63 characters made of letters, digits and
 * hyphens only, with no leading or trailing hyphen. Names containing multibyte
 * characters are converted with idn_to_ascii() before being checked.
 *
 * @param string $url The candidate hostname.
 * @return bool
 */
function validhostname($url) {
	$hostname=$url;
	if (ismultibyte($hostname)) {
		$hostname=idn_to_ascii($hostname);
		if ($hostname===false) return(false);
	}
	if (strlen($hostname)>253) return(false);
	foreach (explode('.',$hostname) as $label) {
		$len=strlen($label);
		if ($len<1 || $len>63) return(false);
		if (preg_match('#^-#',$label)==1) return(false);
		if (preg_match('#-$#',$label)==1) return(false);
		if (preg_match('#^[a-zA-Z0-9-]+$#',$label)!==1) return(false);
	}
	return(true);
}

/**
 * Runs one crawling round: registers every instance of $list as checked,
 * fetches the peers of each one and collects the peers not seen yet, then
 * recurses on them until a round yields no new peer.
 *
 * @param array $list Names of the instances to query in this round.
 * @param int   $id   Round number, used in messages only.
 */
function crawl($list,$id) {
	global $ainsts, $insts, $deadinsts, $peersf, $allpeersf, $opts, $tini;
	gecho('~~~~~~~ START OF ROUND '.$id.' ~~~~~~~'.N,true,false);
	waituntilonline();
	updexarr();

	// Hash sets are used for membership tests: they match names exactly and cost
	// O(1) per lookup, whereas in_array() compares loosely (numeric-looking
	// strings such as '1.10' and '1.1' would be considered equal) and scans the
	// whole list every time. $ainsts only changes here, at the start of a round,
	// so $checked stays in sync with it for the whole round.
	$checked=[];
	foreach ($ainsts as $known) $checked[$known]=true;
	foreach ($list as $inst) {
		if (!isset($checked[$inst])) {
			$checked[$inst]=true;
			$ainsts[]=$inst;
			fwrite($allpeersf,$inst.N);
		}
	}
	$dead=[];
	if ($opts['excludedead'])
		foreach ($deadinsts as $deadinst)
			if (is_string($deadinst)) $dead[$deadinst]=true;

	$nlist=[];  // peers to crawl in the next round
	$queued=[]; // same content as $nlist, as a hash set
	$c=count($list);
	$i=0;
	$rtini=time();
	foreach ($list as $inst) {
		$i++;
		$now=time();
		$rtela=$now-$rtini;
		gecho('>>> '.$inst.N,true,false);
		// TET: total elapsed time; ETR: estimated time remaining.
		gecho('@@@ Round '.$id.', '.$i.'/'.$c.': TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' responding insts; '.count($nlist).' insts in next round list; '.count($ainsts).' total.'.N,true,false);
		gecho('Trying to load «'.$inst.'»’s peers...'.N,true,false);
		$peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['timeout']);
		if ($peers['cont']===false) {
			gecho('ERROR: '.$peers['emsg'].N,true,true);
			continue;
		}
		$peers=@json_decode($peers['cont'],true);
		if (!is_array($peers)) {
			gecho('ERROR: $peers is not an array (its type is '.gettype($peers).').'.N,true,true);
			continue;
		}
		gecho('LOADED!'.N,true,false);
		if (in_array($inst,$insts,true)) {
			gecho('NOTICE: «'.$inst.'» is not a new instance (it was already in $insts).'.N,true,false);
		} else {
			gecho('NEW INSTANCE FOUND: «'.$inst.'».'.N,true,false);
			$insts[]=$inst;
			fwrite($peersf,$inst.N);
		}
		foreach ($peers as $peer) {
			// No check against $list or $insts is needed: both are subsets of
			// $ainsts, which is populated at the start of the round.
			if (!is_string($peer)) {
				gecho(' ERROR: I won’t add this peer to next round list because its name is not a string.'.N,true,true);
			} elseif (!validhostname($peer)) {
				gecho(' ERROR: I won’t add «'.$peer.'» to next round list because it’s not a valid hostname.'.N,true,true);
			} elseif (ckexarr($peer)) {
				gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,false);
			} elseif (isset($checked[$peer])) {
				if ($opts['verbose']) gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $ainsts.'.N,true,false);
			} elseif (isset($queued[$peer])) {
				if ($opts['verbose']) gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $nlist.'.N,true,false);
			} elseif (isset($dead[$peer])) {
				gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s DEAD.'.N,true,false);
			} else {
				gecho(' ADDING PEER «'.$peer.'» to next round list.'.N,true,false);
				$nlist[]=$peer;
				$queued[$peer]=true;
			}
		}
	}

	if (count($nlist)>0) {
		// Free this round's data before going deeper: every round keeps its
		// stack frame alive until the whole crawl is over.
		unset($list,$checked,$queued,$dead);
		crawl($nlist,$id+1);
	} else {
		gecho('Next round list is empty.'.N,true,false);
	}
	gecho('~~~~~~~ END OF ROUND '.$id.' ~~~~~~~'.N,true,false);
}

$tini=time();
crawl([$opts['startinst']],1);
gecho('DONE CRAWLING! :-)'.N,true,false);
shutdown(true);
$now=time();
gecho('Crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'.'.N,true,false);
gecho(count($ainsts).' URIs checked in '.ght($now-$tini).'; '.count($insts).' responded. Max memory usage: '.ghs(memory_get_peak_usage(true)).N,true,false);

exit(0);