#!/usr/bin/php
<?php
/*
 * NOTE(review): the licence/copyright comment that originally opened this
 * file is missing from this copy (only its closing delimiter survived).
 * Restore it from the upstream source.
 */

// Take the locale from the LANG environment variable.
setlocale(LC_ALL, getenv('LANG'));

// Line terminator used for every message printed by the program.
define('N', "\n");

// Default options; each one can be overridden from the command line.
$opts = array(
    'startinst' => 'mastodon.social', // first instance to crawl
    'peersfp'   => __DIR__.'/peers',  // file the instances list is saved into
    'restore'   => false,             // load an existing peers file on start
    'excludefp' => null               // optional file of exclusion regexes
);

// NOTE(review): the last sentence ("see for details.") has apparently lost its
// reference, presumably a licence URL. Confirm against upstream and restore it.
$help = 'peerscrawl.php

  DESCRIPTION
    This program tries to build a fairly complete list of mastodon instances.

  SYNOPSIS
    peerscrawl.php [options]

  OPTIONS
    -s, --startinst
      Defines the first instance to crawl.
      DEFAULT: «'.$opts['startinst'].'»
    -p, --peersfp
      Defines the file into which the ordered list of instances will be saved.
      DEFAULT: «'.$opts['peersfp'].'»
    -r, --restore
      If peers file already exists on program’s start it will be loaded into
      memory and each instance it contains will be considered “already crawled”,
      thus allowing to “restore an interrupted crawling session”.
    -e, --excludefp
      Defines a file containing exclusion rules: one regular expression per line
      (empty lines are ignored). Any instance matching any defined regex will be
      ignored by the program. Changes made to this file during program execution
      will be taken into account.

  This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
  This is free software, and you are welcome to redistribute it under certain
  conditions; see for details.'.N;

// Command line parsing. Arguments that do not start with «-» are ignored.
for ($i = 1; $i < $argc; $i++) {
    if (substr($argv[$i], 0, 1) == '-') {
        switch ($argv[$i]) {
            case '-s':
            case '--startinst':
                if ($i + 1 >= $argc) {
                    mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N, 1);
                }
                $i++;
                $opts['startinst'] = $argv[$i];
                break;
            case '-p':
            case '--peersfp':
                if ($i + 1 >= $argc) {
                    mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N, 1);
                }
                $i++;
                $opts['peersfp'] = $argv[$i];
                break;
            case '-r':
            case '--restore':
                // This flag takes no value, so the index must NOT be advanced
                // here: doing so would silently swallow the next argument.
                $opts['restore'] = true;
                break;
            case '-e':
            case '--excludefp':
                if ($i + 1 >= $argc || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1])) {
                    mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable file’s path (use «-h» for more info).'.N, 1);
                }
                $i++;
                $opts['excludefp'] = $argv[$i];
                break;
            case '-h':
            case '--help':
                mexit($help, 0);
                break;
            default:
                mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N, 1);
                break;
        }
    }
}

/**
 * Prints a message and terminates the program.
 *
 * @param string $msg  Text to print (printed as is, no newline is appended).
 * @param int    $code Process exit status.
 */
function mexit($msg, $code)
{
    echo($msg);
    exit($code);
}

/**
 * Closes the peers file and, when requested, rewrites it as a sorted list.
 *
 * @param bool $dosort When true, the global instances list is sorted and the
 *                     peers file is rewritten from scratch with it; when false
 *                     the file is only closed and keeps whatever was appended
 *                     to it so far.
 */
function shutdown($dosort)
{
    global $opts, $peersf, $insts;

    if (is_resource($peersf)) {
        fclose($peersf);
    }
    if (!$dosort) {
        return;
    }

    echo('Saving ordered instances list into «'.$opts['peersfp'].'».'.N);
    sort($insts);
    $peersf = @fopen($opts['peersfp'], 'w');
    if ($peersf === false) {
        echo('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N);
        return;
    }
    foreach ($insts as $inst) {
        fwrite($peersf, $inst.N);
    }
    fclose($peersf);
}

// Ticks are needed for the signal handlers registered below to be dispatched.
declare(ticks=1);

/**
 * Handles termination signals: closes the peers file without sorting it and
 * exits with status 2.
 *
 * @param int $signal Number of the received signal.
 */
function signalHandler($signal)
{
    echo(N.'I got interrupted (signal: '.$signal.').'.N);
    shutdown(false);
    exit(2);
}

pcntl_signal(SIGTERM, 'signalHandler'); // Termination ('kill' was called)
pcntl_signal(SIGHUP, 'signalHandler');  // Terminal log-out
pcntl_signal(SIGINT, 'signalHandler');  // Interrupted (Ctrl-C is pressed)

// Stream context used when fetching the peers lists: 3 seconds HTTP timeout.
$contextopts = array(
    'http' => array(
        'timeout' => 3
    ),
    'socket' => array(
        'tcp_nodelay' => true
    )
);
$context = stream_context_create($contextopts);

// Known instances, in discovery order; shutdown() sorts and saves this list.
$insts = array();
// Lookup set mirroring $insts (instance name => true): gives an exact,
// constant-time "already known?" test instead of a loose in_array() scan.
$known = array();
// Exclusion regexes currently in force; refreshed by updexarr().
$exarr = array();

if ($opts['restore']) {
    $loaded = false;
    if (is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        echo('Loading «'.$opts['peersfp'].'».'.N);
        $loaded = file($opts['peersfp'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    }
    if ($loaded !== false) {
        $insts = $loaded;
        $known = array_fill_keys($insts, true);
    } else {
        echo('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N);
    }
}

// Mode «w» truncates the peers file, so fail early if it can't be opened...
$peersf = @fopen($opts['peersfp'], 'w');
if ($peersf === false) {
    mexit('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N, 1);
}
// ...and immediately write back the restored entries: otherwise an interrupted
// session (which closes the file without the final sorted rewrite) would lose
// every instance known before this run.
foreach ($insts as $restored) {
    fwrite($peersf, $restored.N);
}

/**
 * Tells whether a value is empty or made of whitespace only.
 *
 * @param string $val Value to test.
 * @return bool True when $val contains nothing but whitespace.
 */
function isempty($val)
{
    return preg_match('/^\s*$/', $val) === 1;
}

/**
 * Blocks until a TCP connection to www.google.com:80 succeeds, retrying every
 * 5 seconds and printing a warning after each failed attempt.
 */
function waituntilonline()
{
    $host = 'www.google.com';
    while (false === ($f = @fsockopen($host, 80, $errno, $errstr, 1))) {
        // NOTE(review): strftime() is deprecated as of PHP 8.1; it is kept so
        // the timestamp format does not change.
        echo(strftime('%c').' - WARNING: it seems we are offline :-('.N);
        sleep(5);
    }
    fclose($f);
}

/**
 * Reloads the exclusion regexes from the file given with «--excludefp».
 *
 * Does nothing when no exclusion file was given. When the file can't be
 * opened a warning is printed and the previously loaded rules are kept.
 * Blank lines are skipped; lines that are not valid regular expressions are
 * reported and skipped.
 */
function updexarr()
{
    global $exarr, $opts;

    if (is_null($opts['excludefp'])) {
        return;
    }
    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        echo('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N);
        return;
    }

    $exarr = array();
    $lineno = 0;
    while (($line = fgets($f)) !== false) {
        $lineno++;
        $line = trim($line);
        if (isempty($line)) {
            continue;
        }
        // preg_match() returns false (and warns) on an invalid pattern: the
        // warning is silenced on purpose, this call is only a validity probe.
        if (@preg_match($line, 'foo') !== false) {
            $exarr[] = $line;
        } else {
            echo('WARNING: «'.$opts['excludefp'].'», line '.$lineno.': «'.$line.'» is not a valid regular expression.'.N);
        }
    }
    fclose($f);
}

/**
 * Tells whether an instance name matches any of the exclusion regexes.
 *
 * @param string $inst Instance name.
 * @return bool True when at least one regex matches.
 */
function ckexarr($inst)
{
    global $exarr;

    foreach ($exarr as $re) {
        if (preg_match($re, $inst) === 1) {
            return true;
        }
    }
    return false;
}

/**
 * Records an instance, fetches its peers list and recursively crawls every
 * peer that is neither excluded nor already known.
 *
 * @param string $inst Domain name of the instance to crawl.
 */
function crawl($inst)
{
    global $insts, $known, $peersf, $context;

    waituntilonline();
    updexarr();

    if (!isempty($inst)) {
        // This check is redundant on a normal run, but it prevents the start
        // instance from ending up in the list twice when launched with
        // «--restore».
        if (!isset($known[$inst])) {
            echo('«'.$inst.'» is not a known instance, I add it to the list of known instances.'.N);
            $insts[] = $inst;
            $known[$inst] = true;
            fwrite($peersf, $inst.N);
        }

        echo('«'.$inst.'»: trying to load instance’s peers ... ');
        $peers = @file_get_contents('https://'.$inst.'/api/v1/instance/peers', false, $context);
        if ($peers != false) {
            echo('OK :-)'.N);
            $peers = json_decode($peers, true);
            if (is_array($peers)) {
                foreach ($peers as $peer) {
                    if (!is_string($peer)) {
                        echo('>>> I won’t crawl this peer because its name is not a string.'.N);
                    } elseif (ckexarr($peer)) {
                        echo('>>> I won’t crawl «'.$peer.'» because its name matches with an exclusion regex.'.N);
                    } elseif (!isset($known[$peer])) {
                        echo('>>> I will crawl «'.$peer.'».'.N);
                        crawl($peer);
                    }
                    // Peers that are already known are skipped silently.
                }
            }
        } else {
            echo('ERROR :-('.N);
        }
    } else {
        echo('I WON’T add nameless instances.'.N);
    }

    echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
}

crawl($opts['startinst']);
echo('DONE CRAWLING! :-)'.N);
shutdown(true);
exit(0);