MastodonHelp/web/clitools/peerscrawl.php

492 lines
18 KiB
PHP
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Shorthand constants: newline, this script's file name, and the same name
// without its last extension (used to build the lockfile name).
define('N', "\n");
define('SNAME', basename(__FILE__));
define('BNAME', preg_replace('/\.[^.]*$/','',SNAME));

// Project helpers (gurl(), ghs() and ght() are called further down).
require(__DIR__.'/../site/mustard/include/gurl.php');
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');

use function mysqli_real_escape_string as myesc;

// Default values for every option; the command line may override them.
$opts=[
    // configuration file holding the database credentials
    'inifp'       => __DIR__.'/../conf/mustard.ini',
    // first instance to crawl
    'startinst'   => 'mastodon.social',
    // seconds without a successful check after which a peer is marked dead
    'deadline'    => 62*24*60*60,
    // output files: responding, all, and checked instances
    'peersfp'     => __DIR__.'/peers',
    'apeersfp'    => __DIR__.'/peers.all',
    'cpeersfp'    => __DIR__.'/peers.checked',
    'restore'     => false,
    // optional file of exclusion regular expressions
    'excludefp'   => null,
    // connection and download timeouts, in seconds
    'timeout'     => 8,
    'curltimeout' => 15,
    'loop'        => false,
    'verbose'     => false,
    'excludedead' => false,
    'ignorelock'  => false
];
// Help text shown by «-h»/«--help». Default values are interpolated from
// $opts; each «DEFAULT: «…»» line closes its guillemet in a new string
// segment so that the concatenation is well formed.
$help='SYNOPSIS
'.SNAME.' [options]
DESCRIPTION
This program tries to build a fairly complete list of fediverse instances
exposing the [instance]/api/v1/instance/peers endpoint.
OPTIONS
-s, --startinst <domain>
Defines the first instance to crawl.
DEFAULT: «'.$opts['startinst'].'»
-p, --peersfp <file>
Defines the file into which the ordered list of responding instances
will be saved.
DEFAULT: «'.$opts['peersfp'].'»
-a, --apeersfp <file>
Defines the file into which the ordered list of all instances will
be saved.
DEFAULT: «'.$opts['apeersfp'].'»
-c, --cpeersfp <file>
Defines the file into which the ordered list of all checked instances will
be saved.
DEFAULT: «'.$opts['cpeersfp'].'»
-I, --ignorelock
Normally, if its lockfile exists, the program exits with an error before
doing anything. With this option the lockfile is ignored. Please verify
that the program is not already running before using it.
-r, --restore
>>> Currently not working, causes script to just exit with an error message.
If peers files «peers», «peers.all», «peers.checked» exist on programs
start they will be loaded, thus allowing to restore an interrupted previous
crawling session. This option is mutually exclusive with the «loop» option.
-e, --excludefp <file>
Defines a file containing exclusion rules: one regular expression per
line (empty lines are ignored). Any instance matching any defined regex
will be ignored by the program. Changes made to this file during program
execution will be taken into account.
-E, --excludedead
Exclude instances marked as “Dead” in the database.
-l, --loop
Normally the script will exit after completing a crawl; if this option
is set, it will restart crawling until it receives a SIGTERM, SIGHUP
or SIGINT. This option is mutually exclusive with the «restore» option.
-t, --timeout <seconds>
Defines the timeout in seconds for every connection attempt.
DEFAULT: «'.$opts['timeout'].'»
-T, --curltimeout <seconds>
Defines the timeout in seconds for every download.
DEFAULT: «'.$opts['curltimeout'].'»
-v, --verbose
Be more verbose.
-h, --help
Show this help text and exit.
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Command line parsing, driven by three lookup tables:
//  - $valueopts: options followed by a free-form value
//    (flag => [key in $opts, description used in the error message]);
//  - $secopts:   options followed by a non-negative integer of seconds;
//  - $flagopts:  boolean switches that simply turn a key of $opts on.
$valueopts=[
    '-s'=>['startinst','a domain name'], '--startinst'=>['startinst','a domain name'],
    '-p'=>['peersfp','a files path'],    '--peersfp'=>['peersfp','a files path'],
    '-a'=>['apeersfp','a files path'],   '--apeersfp'=>['apeersfp','a files path'],
    '-c'=>['cpeersfp','a files path'],   '--cpeersfp'=>['cpeersfp','a files path'],
    '-e'=>['excludefp','a files path'],  '--excludefp'=>['excludefp','a files path']
];
$secopts=[
    '-t'=>'timeout',     '--timeout'=>'timeout',
    '-T'=>'curltimeout', '--curltimeout'=>'curltimeout'
];
$flagopts=[
    '-r'=>'restore',     '--restore'=>'restore',
    '-I'=>'ignorelock',  '--ignorelock'=>'ignorelock',
    '-E'=>'excludedead', '--excludedead'=>'excludedead',
    '-l'=>'loop',        '--loop'=>'loop',
    '-v'=>'verbose',     '--verbose'=>'verbose'
];
for ($i=1; $i<$argc; $i++) {
    $arg=$argv[$i];
    if (isset($valueopts[$arg])) {
        [$key,$what]=$valueopts[$arg];
        if ($i+1>=$argc)
            mexit('Error: option «'.$arg.'» has to be followed by '.$what.' (use «-h» for more info).'.N,1);
        // consume the value that follows the option
        $opts[$key]=$argv[++$i];
    } elseif (isset($secopts[$arg])) {
        if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
            mexit('Error: option «'.$arg.'» has to be followed by a number of seconds (use «-h» for more info).'.N,1);
        // «+0» turns the digit string into a number
        $opts[$secopts[$arg]]=$argv[++$i]+0;
    } elseif (isset($flagopts[$arg])) {
        $opts[$flagopts[$arg]]=true;
    } elseif ($arg=='-h' || $arg=='--help') {
        mexit($help,0);
    } else {
        mexit('Error: dont know how to interpret «'.$arg.'» (use «-h» to read the help text).'.N,1);
    }
}
// Startup: validate option combinations, take the lockfile, install the
// signal handlers, read the ini file and connect to the database.

// The «restore» code path is unfinished, so the script refuses to run with it.
if ($opts['restore']) mexit('Error: “restore” options code has to be finished, it currently doesnt work; exiting.'.N,1);
if ($opts['loop'] && $opts['restore']) mexit('Error: “loop” and “restore” options are mutually exclusive (use «-h» to read the help text).'.N,1);

$lockfp=__DIR__.'/'.BNAME.'.lock';
if (is_file($lockfp) && !$opts['ignorelock']) {
    // Deliberately not mexit(): mexit() unlinks the lockfile when it exists,
    // and here the lockfile is not ours.
    gecho('Error: lockfile exists: it seems the program is already running; if youre sure its not, you can use «-I» to force execution.'.N,false,true);
    exit(1);
}
// mexit() takes (message, exit code): a failure must exit with a non-zero
// status and print on STDERR.
if (@touch($lockfp)===false) mexit('Error: could not create lockfile «'.$lockfp.'».'.N,1);

//declare(ticks=1);
pcntl_async_signals(true);
pcntl_signal(SIGTERM,'sighandler');// Termination ('kill' was called)
pcntl_signal(SIGHUP,'sighandler');// Terminal log-out
pcntl_signal(SIGINT,'sighandler');// Interrupted (Ctrl-C is pressed)

$iniarr=@parse_ini_file($opts['inifp'])
    or mexit('Error: couldnt open «'.$opts['inifp'].'».'.N,1);

try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
catch (Exception $error) { mexit('Error: couldnt connect to MySQL server: '.mysqli_connect_error().'.'.N,1); }
// for php versions < 8
if ($link===false) mexit('Error: couldnt connect to MySQL server: '.mysqli_connect_error().'.'.N,1);

try { $res=mysqli_set_charset($link,'utf8mb4'); }
catch (Exception $error) { mexit('Error: couldnt set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,1); }
// for php versions < 8
if ($res===false) mexit('Error: couldnt set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,1);
// With «--excludedead», collect the hostnames flagged as dead in the
// Instances table (column URI) and in the Peers table (column Hostname)
// into $deadinsts, without duplicates.
$deadinsts=[];
if ($opts['excludedead']) {
    $res=myq($link,'SELECT URI FROM Instances WHERE Dead=1');
    while ($row=mysqli_fetch_assoc($res))
        if (!in_array($row['URI'],$deadinsts))
            $deadinsts[]=$row['URI'];
    $res=myq($link,'SELECT Hostname FROM Peers WHERE IsDead=1');
    while ($row=mysqli_fetch_assoc($res))
        if (!in_array($row['Hostname'],$deadinsts))
            // this result set only has a «Hostname» column (no «URI»)
            $deadinsts[]=$row['Hostname'];
    unset($res,$row);
    gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
}
// Crawl state shared with crawl() through globals: responding instances,
// checked instances, all seen instances, and the exclusion regexps.
$insts=$cinsts=$ainsts=$exarr=[];
// How the output files get opened: truncated by default, appended to when
// a previous session is restored.
$mode=['mode'=>'w','desc'=>'write'];
if ($opts['restore']) {
    $readflags=FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES;
    if (($insts=@file($opts['peersfp'],$readflags))===false)
        mexit('Error: couldnt open «'.$opts['peersfp'].'» for reading.'.N,1);
    if (($cinsts=@file($opts['cpeersfp'],$readflags))===false)
        mexit('Error: couldnt open «'.$opts['cpeersfp'].'» for reading.'.N,1);
    if (($ainsts=@file($opts['apeersfp'],$readflags))===false)
        mexit('Error: couldnt open «'.$opts['apeersfp'].'» for reading.'.N,1);
    $mode=['mode'=>'a','desc'=>'append'];
    gecho('Succesfully restored previous session :-)'.N,true,false);
}
// Main loop: one full crawl per iteration; repeats only with «--loop».
while (true) {
    // Files written incrementally while crawling.
    $peersf=@fopen($opts['peersfp'],$mode['mode']);
    if ($peersf===false)
        mexit('Error: couldnt open «'.$opts['peersfp'].'» in '.$mode['desc'].' mode.'.N,1);
    $apeersf=@fopen($opts['apeersfp'],$mode['mode']);
    if ($apeersf===false)
        mexit('Error: couldnt open «'.$opts['apeersfp'].'» in '.$mode['desc'].' mode.'.N,1);
    $cpeersf=@fopen($opts['cpeersfp'],$mode['mode']);
    if ($cpeersf===false)
        mexit('Error: couldnt open «'.$opts['cpeersfp'].'» in '.$mode['desc'].' mode.'.N,1);

    $maxround=1;
    $tini=time();
    // go
    crawl($opts['restore'] ? $insts : [$opts['startinst']],1);

    gecho('Done crawling! :-)'.N,true,false);
    $now=time();
    gecho('Crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'.'.N,true,false);
    gecho(count($ainsts).' URIs checked in '.ght($now-$tini).' ('.$maxround.' rounds); '.count($insts).' responded. Max. memory usage: '.ghs(memory_get_peak_usage(true)).'.'.N,true,false);

    fclose($peersf);
    fclose($cpeersf);
    fclose($apeersf);

    // Rewrite the three files sorted and without duplicates.
    sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
    sortcheckandsave($cinsts,'list of checked instances',$opts['cpeersfp']);
    sortcheckandsave($ainsts,'list of all instances',$opts['apeersfp']);

    // Start the next crawl (if any) from a clean state.
    $insts=[];
    $cinsts=[];
    $ainsts=[];

    if (!$opts['loop']) break;
}
mysqli_close($link);
unlink($lockfp);
exit(0);
// functions
/**
 * Runs one crawl round and recurses into the next one.
 *
 * For every hostname in $list the function requests
 * https://<host>/api/v1/instance/peers, records the host in the
 * all/checked/responding lists (arrays and files), queues every acceptable
 * new peer for the next round, and stores the outcome of the check in the
 * Peers and PeersChecks tables. When the next round list is not empty it
 * calls itself with that list and $id+1.
 *
 * Works on the globals $insts, $cinsts, $ainsts, $deadinsts, $opts, $tini,
 * $maxround, $link and on the open file handles $peersf, $cpeersf, $apeersf.
 *
 * @param array $list hostnames to check in this round
 * @param int   $id   number of this round (the first round is 1)
 */
function crawl($list,$id) {
    global $insts, $deadinsts, $cinsts, $ainsts, $tini, $opts, $peersf, $cpeersf, $apeersf, $maxround, $link;
    gecho('###### START OF ROUND '.$id.' ######'.N,true,false);
    $nlist=[];
    $c=count($list);
    $i=0;
    $rtini=time();
    foreach ($list as $inst) {
        if (!in_array($inst,$ainsts)) {
            $ainsts[]=$inst;
            fwrite($apeersf,$inst.N);
        }
        $i++;
        $now=time();
        $rtela=$now-$rtini;
        gecho('Working on «'.$inst.'»: round '.$id.', '.$i.'/'.$c.'; TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' discovered instances; '.count($nlist).' instances in next round list.'.N,true,false);
        waituntilonline();
        // reload the exclusion rules, so that edits made while running apply
        updexarr();
        gecho('Trying to load «'.$inst.'»’s peers...'.N,true,false);
        $peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['timeout'],$opts['curltimeout']);
        // no in_array() check needed: a host is never queued twice
        $cinsts[]=$inst;
        fwrite($cpeersf,$inst.N);
        $responded=0;
        if ($peers['cont']===false) {
            gecho('Error loading «'.$inst.'»’s peers: '.$peers['emsg'].'.'.N,true,true);
        } else {
            $peers=@json_decode($peers['cont'],true);
            if (!is_array($peers)) {
                gecho('Error loading «'.$inst.'»’s peers: got not good JSON.'.N,true,true);
            } else {
                gecho('Successfully loaded «'.$inst.'»’s peers :-)'.N,true,false);
                $responded=1;
                if (!in_array($inst,$insts)) {
                    gecho('Discovered instance «'.$inst.'» :-)'.N,true,false);
                    $insts[]=$inst;
                    fwrite($peersf,$inst.N);
                }
                foreach ($peers as $peer) {
                    // The JSON may hold anything: entries that are not strings
                    // cannot be hostnames and must not reach string functions
                    // or the output files.
                    if (!is_string($peer)) {
                        if ($opts['verbose']) gecho(' Not adding peer «'.json_encode($peer).'» to next round list because its name is not a string.'.N,true,true);
                        continue;
                    }
                    if (!in_array($peer,$ainsts)) {
                        $ainsts[]=$peer;
                        fwrite($apeersf,$peer.N);
                    }
                    // collect every reason why this peer should not be queued
                    $whynot=[];
                    if (in_array($peer,$cinsts)) $whynot[]='it has already been checked';
                    if (!validhostname($peer)) $whynot[]='its name is not a valid hostname';
                    if (ckexarr($peer)) $whynot[]='its name matches an exclusion regexp';
                    if (in_array($peer,$list)) $whynot[]='it is already present in current list';
                    if (in_array($peer,$nlist)) $whynot[]='it has already been added to next round list';
                    if ($opts['excludedead'] && in_array($peer,$deadinsts)) $whynot[]='its dead';
                    if (count($whynot)>0) {
                        if ($opts['verbose']) gecho(' Not adding peer «'.$peer.'» to next round list because '.implode(', ',$whynot).'.'.N,true,true);
                    } else {
                        if ($opts['verbose']) gecho(' Adding peer «'.$peer.'» to next round list :-)'.N,true,false);
                        $nlist[]=$peer;
                    }
                }
            }
        }
        // Store the outcome of this check in the database.
        $instid=0;
        $res=myq($link,'SELECT * FROM Peers WHERE Hostname=\''.myesc($link,$inst).'\'');
        if (mysqli_num_rows($res)>0) {
            $row=mysqli_fetch_assoc($res);
            $instid=$row['ID'];
            $dead=0;
            if (!$responded) {
                // we check the last time instance responded, if ever
                $res=myq($link,'SELECT Time FROM PeersChecks WHERE InstID='.$instid.' AND Status=1 ORDER BY Time DESC LIMIT 1');
                // if instance never responded we consider the time of first check
                if (mysqli_num_rows($res)==0)
                    $res=myq($link,'SELECT Time FROM PeersChecks WHERE InstID='.$instid.' ORDER BY Time ASC LIMIT 1');
                if (mysqli_num_rows($res)>0) {
                    $row=mysqli_fetch_assoc($res);
                    if ($now-$row['Time']>$opts['deadline']) {
                        $dead=1;
                        gecho('«'.$instid.'» just died!'.N,true,true);
                    }
                } else {
                    gecho('«'.$inst.'» exists in Peers table but theres no data about it in PeersChecks!'.N,true,true);
                }
            }
            $query='UPDATE Peers SET Hostname=\''.myesc($link,$inst).'\', IsDead='.$dead.' WHERE ID='.$instid;
        } else {
            $query='INSERT INTO Peers SET Hostname=\''.myesc($link,$inst).'\', IsDead=0';
        }
        myq($link,$query);
        // a freshly inserted peer gets its ID from the INSERT above
        if ($instid==0) $instid=mysqli_insert_id($link);
        myq($link,'INSERT INTO PeersChecks SET InstID='.$instid.', Time='.$now.', Status='.$responded);
    }
    if (count($nlist)>0) {
        // Record the deepest round before recursing: assigning after the
        // recursive call returned would let outer rounds overwrite the value
        // set by deeper ones.
        $maxround=max($maxround,$id+1);
        unset($list);
        crawl($nlist,$id+1);
    } else {
        gecho('Next round list is empty.'.N,true,false);
    }
    gecho('###### END OF ROUND '.$id.' ######'.N,true,false);
}
/**
 * Releases what the script holds (database link, output files, lockfile),
 * prints $msg and terminates the script.
 *
 * @param string $msg  message to print: on standard output when $code is 0,
 *                     on STDERR otherwise
 * @param int    $code exit status
 */
function mexit($msg,$code) {
    global $link, $peersf, $cpeersf, $apeersf, $lockfp;
    if (isset($link) && $link!==false) mysqli_close($link);
    // globals that were never assigned show up here as null
    foreach ([$peersf,$cpeersf,$apeersf] as $handle) {
        if ($handle!==null && $handle!==false) @fclose($handle);
    }
    if (isset($lockfp) && is_file($lockfp)) unlink($lockfp);
    if ($code!=0) {
        fwrite(STDERR,$msg);
    } else {
        echo($msg);
    }
    exit($code);
}
/**
 * Prints a message, optionally prefixed by a timestamp.
 *
 * @param string $msg    text to print (no newline is added)
 * @param bool   $prtime when true, the output of microdate() and a space
 *                       are put before the message
 * @param bool   $iserr  when true the message goes to STDERR, otherwise to
 *                       standard output
 */
function gecho($msg,$prtime,$iserr) {
    $out=$prtime ? microdate().' '.$msg : $msg;
    if (!$iserr) {
        echo($out);
        return;
    }
    fwrite(STDERR,$out);
}
/**
 * Runs a query and terminates the script (exit status 2) when it fails.
 *
 * @param mysqli $link  database connection
 * @param string $query SQL statement to run
 * @return mixed whatever mysqli_query() returned (never false)
 */
function myq(&$link,$query) {
    try {
        $result=mysqli_query($link,$query);
    } catch (Exception $failure) {
        mexit('Error: query «'.$query.'» failed: '.$failure->getMessage().' ('.$failure->getCode().').'.N,2);
    }
    // for php versions < 8, which seem to not catch mysql exceptions
    if ($result===false) {
        mexit('Error: query «'.$query.'» failed: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,2);
    }
    return $result;
}
/**
 * Formats a timestamp as «Y-m-d H:i:s.<fraction>».
 *
 * @param string|null $time a value in the «0.fraction seconds» format of
 *                          microtime(false); the current time when null
 * @return string the formatted date, followed by a dot and the digits of
 *                the fractional part
 */
function microdate($time=null) {
    if ($time===null) {
        $time=microtime(false);
    }
    $parts=explode(' ',$time);
    // $parts[0] looks like «0.12345600»: drop the leading «0.»
    $fraction=substr($parts[0],2);
    return date('Y-m-d H:i:s',$parts[1]).'.'.$fraction;
}
/**
 * Removes duplicates from a list, sorts it and writes it to a file, one
 * entry per line.
 *
 * The array is modified in place. A warning is printed when duplicates were
 * found, an error when the file cannot be opened for writing.
 *
 * @param array  $arr     list to save (passed by reference)
 * @param string $arrdesc description of the list, used in messages
 * @param string $fp      path of the destination file
 */
function sortcheckandsave(&$arr,$arrdesc,&$fp) {
    $before=count($arr);
    $arr=array_unique($arr);
    if (count($arr)!=$before) {
        gecho('Warning: '.$arrdesc.' contained duplicates, better check code ;-)'.N,true,true);
    }
    gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N,true,false);
    sort($arr);
    $out=@fopen($fp,'w');
    if ($out===false) {
        gecho('Error: couldnt open «'.$fp.'» for writing.'.N,true,true);
        return;
    }
    foreach ($arr as $entry) {
        fwrite($out,$entry.N);
    }
    fclose($out);
}
/**
 * Signal handler installed for SIGTERM, SIGHUP and SIGINT: closes the
 * output files, removes the lockfile, reports the signal and exits with
 * status 0.
 *
 * @param int $signal number of the received signal
 */
function sighandler($signal) {
    global $peersf, $cpeersf, $apeersf, $lockfp;
    // globals that were never assigned show up here as null
    foreach ([$peersf,$cpeersf,$apeersf] as $handle) {
        if ($handle!==null && $handle!==false) @fclose($handle);
    }
    if (isset($lockfp) && is_file($lockfp)) {
        unlink($lockfp);
    }
    echo(N.'Interrupted (signal: '.$signal.').'.N);
    exit(0);
}
/**
 * Tells whether a string is empty or made of whitespace only.
 *
 * @param string $val string to test
 * @return bool true when $val holds nothing but whitespace
 */
function isempty($val) {
    return preg_match('/^\s*$/',$val)===1;
}
/**
 * Blocks until a TCP connection to www.google.com:80 succeeds.
 *
 * Each attempt has a one second timeout; after a failed attempt a warning
 * is printed and the function sleeps 10 seconds before trying again. When
 * at least one attempt failed, a message announces that the connection is
 * back.
 */
function waituntilonline() {
    $host='www.google.com';
    $wasoffline=false;
    for (;;) {
        $sock=@fsockopen($host,80,$errno,$errstr,1);
        if ($sock!==false) break;
        $wasoffline=true;
        gecho('Warning: it seems we are offline, waiting 10 seconds before retrying...'.N,true,true);
        sleep(10);
    }
    fclose($sock);
    if ($wasoffline) {
        gecho('It seems we are back online! :-)'.N,true,false);
    }
}
/**
 * Reloads the exclusion rules into the global $exarr.
 *
 * Does nothing when no exclusion file was configured. Otherwise every
 * non-blank line of the file is taken as a regular expression; lines that
 * preg_match() rejects are reported and skipped. When the file cannot be
 * opened a warning is printed and the rules already loaded stay in place.
 */
function updexarr() {
    global $exarr, $opts;
    if (is_null($opts['excludefp'])) return;
    $f=@fopen($opts['excludefp'],'r');
    if ($f===false) {
        gecho('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N,true,true);
        return;
    }
    $i=0;
    $exarr=[];
    // fgets() returns false at end of file, which ends the loop
    while (($line=fgets($f))!==false) {
        $i++;
        $line=trim($line);
        if (isempty($line)) continue;
        // a pattern that does not compile makes preg_match() return false
        if (@preg_match($line,'foo')!==false)
            $exarr[]=$line;
        else
            gecho('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N,true,true);
    }
    // this function runs once per crawled instance: release the handle
    fclose($f);
}
/**
 * Tells whether a hostname matches any of the exclusion regular
 * expressions held in the global $exarr.
 *
 * @param string $inst hostname to test
 * @return bool true on the first matching expression, false when none match
 */
function ckexarr($inst) {
    global $exarr;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern,$inst)!==1) continue;
        return true;
    }
    return false;
}
/**
 * Tells whether a string holds at least one byte outside the ASCII range,
 * which for UTF-8 text means at least one multi-byte character.
 *
 * Comparing strlen() with the number of matches of «/./u» gives a wrong
 * answer for ASCII strings holding a newline, because «.» does not match
 * it; looking for a non-ASCII byte has no such blind spot.
 *
 * @param string $s string to test
 * @return bool true when $s holds a non-ASCII byte
 */
function ismultibyte($s) {
    return(preg_match('/[^\x00-\x7F]/',$s)===1);
}
/**
 * Tells whether a value is a syntactically valid hostname.
 *
 * Names holding non-ASCII characters are first converted with
 * idn_to_ascii(). The result must be at most 253 bytes long, and every
 * dot-separated label must be 1 to 63 bytes long, made only of letters,
 * digits and hyphens, and must neither start nor end with a hyphen.
 *
 * @param mixed $hostname value to test; anything that is not a string is
 *                        rejected
 * @return bool true when $hostname is a valid hostname
 */
function validhostname($hostname) {
    // decoded JSON can hold arrays, numbers or null: none is a hostname
    if (!is_string($hostname)) return(false);
    if (ismultibyte($hostname)) {
        $hostname=idn_to_ascii($hostname,IDNA_DEFAULT,INTL_IDNA_VARIANT_UTS46);
        // idn_to_ascii() returns false when the name cannot be converted
        if ($hostname===false) return(false);
    }
    if (strlen($hostname)>253) return(false);
    foreach (explode('.',$hostname) as $label) {
        $len=strlen($label);
        if ($len<1 || $len>63) return(false);
        if ($label[0]==='-' || $label[$len-1]==='-') return(false);
        // «\z» instead of «$»: «$» would also accept a trailing newline
        if (preg_match('#^[a-zA-Z0-9-]+\z#',$label)!==1) return(false);
    }
    return(true);
}
//$url='www.team.starschlep.com/'; if (validhostname($url)) echo('OK: '.$url.N); else echo('KO: '.$url.N); die();
?>