MastodonHelp/web/clitools/peerscrawl.php
2022-12-01 05:44:06 +01:00

452 lines
15 KiB
PHP
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const N="\n";
require(__DIR__.'/../site/mustard/include/getfc.php');
// Use the locale named by the LANG environment variable for all categories.
setlocale(LC_ALL,getenv('LANG'));

// Default values of every runtime option; the command line parser below
// overrides individual entries.
$opts=[
    // Path of the ini file holding the database settings.
    'inifp'=>__DIR__.'/../conf/mustard.ini',
    // First instance to crawl.
    'startinst'=>'mastodon.social',
    // Output file: instances that answered the peers request.
    'peersfp'=>__DIR__.'/peers',
    // Output file: every instance that has been checked.
    'allpeersfp'=>__DIR__.'/peers.all',
    // Whether to preload the peers file left by a previous run.
    'restore'=>false,
    // Optional file with one exclusion regex per line.
    'excludefp'=>null,
    // Connection timeout, in seconds.
    'timeout'=>5,
    'verbose'=>false,
    // Whether to skip instances flagged as dead in the database.
    'excludedead'=>false,
    'timezone'=>date_default_timezone_get(),
    // Whether to run even if the lockfile exists.
    'ignorelock'=>false,
];
// Usage text printed by «-h» / «--help». The current default of each option
// is spliced into the text, so every «DEFAULT» line closes the string,
// concatenates the value and reopens the string with the closing «»».
$help='peerscrawl.php
DESCRIPTION
This program tries to build a fairly complete list of fediverse instances
exposing the [instance]/api/v1/instance/peers endpoint.
SYNOPSIS
peerscrawl.php [options]
OPTIONS
-s, --startinst <domain>
Defines the first instance to crawl.
DEFAULT: «'.$opts['startinst'].'»
-p, --peersfp <file>
Defines the file into which the ordered list of responding instances
will be saved.
DEFAULT: «'.$opts['peersfp'].'»
-a, --allpeersfp <file>
Defines the file into which the ordered list of all checked instances
will be saved.
DEFAULT: «'.$opts['allpeersfp'].'»
-I, --ignorelock
Normally, if its lockfile exists, the program exits with an error before
doing anything. With this option the lockfile is ignored. Please verify
that the program is not already running before using it.
-r, --restore
If peers file already exists on programs start it will be loaded into
memory and each instance it contains will be considered “already
crawled”, thus allowing to “restore an interrupted crawling session”.
-e, --excludefp <file>
Defines a file containing exclusion rules: one regular expression per
line (empty lines are ignored). Any instance matching any defined regex
will be ignored by the program. Changes made to this file during program
execution will be taken into account.
-t, --timeout <seconds>
Defines the timeout in seconds for every connection attempt.
DEFAULT: «'.$opts['timeout'].'»
-T, --timezone <timezone identifier>
Defines the timezone for displaying localized values for dates and times.
DEFAULT on this system: «'.$opts['timezone'].'»
Note: if you want localized format as well set LANG environment variable.
-L, --tzlist
List all valid timezones and exit.
-v, --verbose
Be more verbose.
-d, --excludedead
Exclude instances marked as "Dead" in the database.
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Command line parsing. Each recognised option updates $opts; options that
// take a value consume the following argument. Any problem ends the program
// through mexit() with exit code 1.
for ($i=1; $i<$argc; $i++) {
    $arg=$argv[$i];
    // Arguments that do not start with a dash are skipped without notice.
    if (substr($arg,0,1)!='-')
        continue;
    // Argument following the option, or null when the option is the last one.
    $next=($i+1<$argc) ? $argv[$i+1] : null;
    if (in_array($arg,['-s','--startinst'],true)) {
        if (is_null($next))
            mexit('Option «'.$arg.'» has to be followed by a domain name (use «-h» for more info).'.N,1);
        $opts['startinst']=$next;
        $i++;
    } elseif (in_array($arg,['-p','--peersfp'],true)) {
        if (is_null($next))
            mexit('Option «'.$arg.'» has to be followed by a files path (use «-h» for more info).'.N,1);
        $opts['peersfp']=$next;
        $i++;
    } elseif (in_array($arg,['-a','--allpeersfp'],true)) {
        if (is_null($next))
            mexit('Option «'.$arg.'» has to be followed by a files path (use «-h» for more info).'.N,1);
        $opts['allpeersfp']=$next;
        $i++;
    } elseif (in_array($arg,['-r','--restore'],true)) {
        $opts['restore']=true;
    } elseif (in_array($arg,['-I','--ignorelock'],true)) {
        $opts['ignorelock']=true;
    } elseif (in_array($arg,['-e','--excludefp'],true)) {
        // The exclusion file must already exist and be readable.
        if (is_null($next) || !file_exists($next) || !is_file($next) || !is_readable($next))
            mexit('Option «'.$arg.'» has to be followed by an existing, readable files path (use «-h» for more info).'.N,1);
        $opts['excludefp']=$next;
        $i++;
    } elseif (in_array($arg,['-t','--timeout'],true)) {
        // Only a plain run of digits is accepted.
        if (is_null($next) || preg_match('/^[0-9]+$/',$next)!==1)
            mexit('Option «'.$arg.'» has to be followed by a number of seconds (use «-h» for more info).'.N,1);
        $opts['timeout']=$next+0;
        $i++;
    } elseif (in_array($arg,['-T','--timezone'],true)) {
        // The validity check is the attempt to set the timezone itself,
        // so a valid identifier takes effect right here.
        if (is_null($next) || !@date_default_timezone_set($next))
            mexit('Option «'.$arg.'» has to be followed by a valid timezone identifier (use «-h» for more info).'.N,1);
        $opts['timezone']=$next;
        $i++;
    } elseif (in_array($arg,['-L','--tzlist'],true)) {
        foreach (timezone_identifiers_list() as $tzname)
            gecho($tzname.N,false,false);
        exit(0);
    } elseif (in_array($arg,['-v','--verbose'],true)) {
        $opts['verbose']=true;
    } elseif (in_array($arg,['-d','--excludedead'],true)) {
        $opts['excludedead']=true;
    } elseif (in_array($arg,['-h','--help'],true)) {
        mexit($help,0);
    } else {
        mexit('Option «'.$arg.'» is unknown (use «-h» for more info).'.N,1);
    }
}
// Lockfile handling: refuse to start when another run appears to be active.
//
// mexit() removes the file named by the global $lockfp whenever that variable
// is set. The existence check is therefore done on a temporary variable, and
// $lockfp is only assigned once this process is entitled to the lock;
// otherwise the refused start would delete the lockfile that belongs to the
// instance that is already running.
$lockcandidate=__DIR__.'/peerscrawl.lock';
if (file_exists($lockcandidate) && !$opts['ignorelock'])
    mexit('Lockfile exists: it seems the program is already running; if youre sure its not true, use «-I» to force execution.'.N,2);
$lockfp=$lockcandidate;
unset($lockcandidate);
touch($lockfp);
/**
 * Prints a final message, releases what the program holds and terminates.
 *
 * The database link is closed if the global $link is truthy, and the lockfile
 * is removed if the global $lockfp is set and the file exists.
 *
 * @param string $msg  Message to print: on standard output when $code is 0,
 *                     on standard error otherwise.
 * @param int    $code Process exit code.
 */
function mexit($msg,$code) {
    global $link, $lockfp;
    if ($link) {
        mysqli_close($link);
    }
    $lockpresent=isset($lockfp) && file_exists($lockfp);
    if ($lockpresent) {
        unlink($lockfp);
    }
    if ($code!=0) {
        fwrite(STDERR,$msg);
    } else {
        echo $msg;
    }
    exit($code);
}
/**
 * Writes a message to standard output or standard error.
 *
 * @param string $msg    Text to write (no newline is appended).
 * @param bool   $prtime When true, the text is prefixed with the current
 *                       timestamp from microdate() and a space.
 * @param bool   $iserr  When true the text goes to STDERR, otherwise it is
 *                       echoed.
 */
function gecho($msg,$prtime,$iserr) {
    $line=$prtime ? microdate().' '.$msg : $msg;
    if (!$iserr) {
        echo $line;
        return;
    }
    fwrite(STDERR,$line);
}
/**
 * Formats a timestamp as «Y-m-d H:i:s» followed by a dot and the fractional
 * digits.
 *
 * @param string|null $time A string in the «fraction seconds» layout produced
 *                          by microtime(false), e.g. «0.12345600 1669869846»;
 *                          when null the current time is used.
 * @return string The date built from the seconds part, a dot, then the
 *                fraction part with its first two characters removed.
 */
function microdate($time=null) {
    $stamp=is_null($time) ? microtime(false) : $time;
    list($fraction,$seconds)=explode(' ',$stamp);
    return date('Y-m-d H:i:s',$seconds).'.'.substr($fraction,2);
}
/**
 * Deduplicates and sorts a list in place, then writes it to a file, one entry
 * per line.
 *
 * A warning is printed when duplicates had to be removed; an error is printed
 * (and nothing is written) when the file cannot be opened for writing.
 *
 * @param array  $arr     List to process; modified in place.
 * @param string $arrdesc Human readable description of the list, used in
 *                        messages.
 * @param string $fp      Path of the destination file (overwritten).
 */
function sortcheckandsave(&$arr,$arrdesc,&$fp) {
    $before=count($arr);
    $arr=array_unique($arr);
    $removed=$before-count($arr);
    if ($removed!=0) {
        gecho('WARNING: '.$arrdesc.' contained '.$removed.' duplicates, better check my code ;-)'.N,true,true);
    }
    gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N,true,false);
    sort($arr);
    $out=@fopen($fp,'w');
    if ($out===false) {
        gecho('ERROR: couldnt open «'.$fp.'» for writing.'.N,true,true);
        return;
    }
    foreach ($arr as $entry) {
        fwrite($out,$entry.N);
    }
    fclose($out);
}
/**
 * Releases the resources held by the crawl: closes both output files and
 * removes the lockfile if it exists.
 *
 * @param bool $dosort When true, both instance lists are additionally
 *                     deduplicated, sorted and rewritten to their files
 *                     through sortcheckandsave().
 */
function shutdown($dosort) {
    global $opts, $peersf, $allpeersf, $insts, $ainsts, $lockfp;
    foreach ([$peersf,$allpeersf] as $handle) {
        if ($handle) {
            @fclose($handle);
        }
    }
    if (isset($lockfp) && file_exists($lockfp)) {
        unlink($lockfp);
    }
    if (!$dosort) {
        return;
    }
    sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
    sortcheckandsave($ainsts,'list of all checked instances',$opts['allpeersfp']);
}
// WARNING: if the script is piped, e.g. "script.php | tee script.log",
// this function is still executed, even though its output is not shown.
/**
 * Signal handler: announces the interruption, cleans up without sorting the
 * lists and ends the process with exit code 3.
 *
 * @param int $signal Number of the received signal.
 */
function signalHandler($signal) {
    $notice=N.'I got interrupted (signal: '.$signal.').'.N;
    echo $notice;
    shutdown(false);
    exit(3);
}
// Asynchronous signal dispatch is used instead of declare(ticks=1).
pcntl_async_signals(true);
// Route these signals to signalHandler():
//   SIGTERM - termination ('kill' was called)
//   SIGHUP  - terminal log-out
//   SIGINT  - interruption (Ctrl-C was pressed)
foreach ([SIGTERM,SIGHUP,SIGINT] as $signo) {
    pcntl_signal($signo,'signalHandler');
}
// URIs of the instances flagged as dead in the database; stays empty unless
// «--excludedead» was requested.
$deadinsts=array();
if ($opts['excludedead']) {
    $iniarr=@parse_ini_file($opts['inifp'])
        or mexit('ERROR: I couldnt open «'.$opts['inifp'].'».'.N,2);
    $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket'])
        or mexit('ERROR: I couldnt connect to MySQL server: '.mysqli_connect_error().N,2);
    mysqli_set_charset($link,'utf8mb4')
        or mexit('ERROR trying to set MySQL client charset: '.__LINE__.': '.mysqli_error($link).N,2);
    $res=mysqli_query($link,'SELECT URI FROM Instances WHERE Dead=1')
        or mexit('ERROR: '.__LINE__.': '.mysqli_error($link).N,2);
    // Read the whole result while the connection is still open.
    while ($row=mysqli_fetch_assoc($res))
        $deadinsts[]=$row['URI'];
    mysqli_free_result($res);
    unset($res);
    mysqli_close($link);
    // mexit() closes $link whenever it is truthy; clearing it here prevents
    // a second mysqli_close() on the already closed connection.
    $link=null;
}
/*$contextopts=array(
'http'=>array(
'timeout'=>$opts['timeout']
),
'socket'=>array(
'tcp_nodelay'=>true
)
);
$context=stream_context_create($contextopts);*/
// $insts:  instances that answered the peers request.
// $ainsts: every instance that has been checked.
// $exarr:  exclusion regexes currently in force.
$insts=array();
$ainsts=array();
$exarr=array();
if ($opts['restore']) {
    if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        gecho('Loading «'.$opts['peersfp'].'».'.N,true,false);
        $insts=file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
        // file() returns false on failure; $insts must stay an array.
        if ($insts===false)
            mexit('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N,2);
    } else {
        mexit('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N,2);
    }
}
$peersf=@fopen($opts['peersfp'],'w');
if (!$peersf) mexit('I could not open «'.$opts['peersfp'].'» in write mode.'.N,2);
// Opening in «w» mode has just emptied the peers file. Write the restored
// entries straight back so that an interrupted run does not lose them (the
// file is otherwise only rewritten in full on a clean finish).
foreach ($insts as $restored)
    fwrite($peersf,$restored.N);
unset($restored);
$allpeersf=@fopen($opts['allpeersfp'],'w');
if (!$allpeersf) mexit('I could not open «'.$opts['allpeersfp'].'» in write mode.'.N,2);
/**
 * Tells whether a string is empty or made of whitespace only.
 *
 * @param string $val String to test.
 * @return bool True when $val matches /^\s*$/, false otherwise.
 */
function isempty($val) {
    return preg_match('/^\s*$/',$val)===1;
}
/**
 * Blocks until a TCP connection to www.google.com on port 80 succeeds.
 *
 * Each attempt has a 1 second timeout; after a failed attempt a warning is
 * printed and the function sleeps 5 seconds before trying again.
 */
function waituntilonline() {
    // Declared as in the original code; not used by this function.
    global $context;
    $host='www.google.com';
    for (;;) {
        $sock=@fsockopen($host,80,$errno,$errstr,1);
        if ($sock!==false) {
            break;
        }
        gecho('WARNING: it seems we are offline :-('.N,true,true);
        sleep(5);
    }
    fclose($sock);
}
/**
 * Reloads the exclusion regexes from the file named by $opts['excludefp'].
 *
 * Does nothing when no exclusion file was configured. When the file cannot be
 * opened a warning is printed and the current $exarr is left untouched.
 * Otherwise $exarr is rebuilt from scratch: blank lines are skipped, lines
 * that are not valid regular expressions are reported with their line number
 * and skipped, and all remaining lines are kept.
 */
function updexarr() {
    global $exarr, $opts;
    if (is_null($opts['excludefp']))
        return;
    $f=@fopen($opts['excludefp'],'r');
    if ($f===false) {
        gecho('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N,true,true);
        return;
    }
    $exarr=array();
    $i=0;
    // Looping on the return value of fgets() stops cleanly at end of file,
    // without handing a false value to trim().
    while (($line=fgets($f))!==false) {
        $i++;
        $line=trim($line);
        if (isempty($line))
            continue;
        // A pattern is valid when preg_match() can run it at all.
        if (@preg_match($line,'foo')!==false)
            $exarr[]=$line;
        else
            gecho('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N,true,true);
    }
    // This function runs once per crawl round: close the handle so that
    // file descriptors do not pile up.
    fclose($f);
}
/**
 * Tells whether an instance name matches any of the exclusion regexes held in
 * the global $exarr.
 *
 * @param string $inst Instance name to test.
 * @return bool True on the first matching regex, false when none matches.
 */
function ckexarr($inst) {
    global $exarr;
    $excluded=false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern,$inst)===1) {
            $excluded=true;
            break;
        }
    }
    return $excluded;
}
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');
/**
 * Tells whether a string is longer in bytes than the number of characters
 * matched by the UTF-8 aware pattern /./u.
 *
 * @param string $s String to test.
 * @return bool True when strlen($s) exceeds the number of matches.
 */
function ismultibyte($s) {
    $matched=0;
    // Only the match counter is of interest; the replaced string is discarded.
    preg_replace('/./u','.',$s,-1,$matched);
    return strlen($s)>$matched;
}
/**
 * Checks that the host part of a string is a syntactically valid hostname.
 *
 * Everything from the first slash onwards and a trailing «:port» are dropped;
 * names reported as multibyte by ismultibyte() are converted with
 * idn_to_ascii() first. The result must be at most 253 bytes long, and each
 * dot-separated label must be 1 to 63 bytes long, must not start or end with
 * a hyphen, and must contain only ASCII letters, digits and hyphens.
 *
 * @param string $url Host name, optionally followed by a port and/or a path.
 * @return bool True when every check passes, false otherwise.
 */
function validhostname($url) {
    $host=preg_replace('#/.*#','',$url);
    $host=preg_replace('#:[0-9]+$#','',$host);
    if (ismultibyte($host)) {
        $host=idn_to_ascii($host);
    }
    if (strlen($host)>253) {
        return false;
    }
    foreach (explode('.',$host) as $part) {
        $size=strlen($part);
        $sizeok=($size>=1 && $size<=63);
        if (!$sizeok) {
            return false;
        }
        $edgehyphen=(preg_match('#^-#',$part)==1 || preg_match('#-$#',$part)==1);
        if ($edgehyphen) {
            return false;
        }
        if (preg_match('#^[a-zA-Z0-9-]+$#',$part)!==1) {
            return false;
        }
    }
    return true;
}
//$url='www.team.starschlep.com/'; if (validhostname($url)) echo('OK: '.$url.N); else echo('KO: '.$url.N); die();
/**
 * Runs one crawl round and recurses into the next one.
 *
 * Every instance of $list is recorded in $ainsts (and in the «all peers»
 * file) if it is not there yet, then its peers endpoint is requested. An
 * instance that returns a JSON array is recorded in $insts (and in the
 * «peers» file) if new. Each returned peer is queued for the next round
 * unless it is not a string, is not a valid hostname, matches an exclusion
 * regex, was already checked, is already queued, or is marked as dead while
 * «--excludedead» is active. When the queue is not empty the function calls
 * itself with it, so the «END OF ROUND» lines are printed in reverse order
 * once the last round is over.
 *
 * Membership tests use arrays keyed by instance name: lookups stay constant
 * time however large the lists grow, and names are compared exactly (a loose
 * in_array() would treat numeric-looking names such as «0123» and «123» as
 * equal).
 *
 * @param array $list Instances to crawl in this round.
 * @param int   $id   Round number, starting from 1.
 */
function crawl($list,$id) {
    global $ainsts, $insts, $deadinsts, $peersf, $allpeersf, $opts, $tini;
    gecho('~~~~~~~ START OF ROUND '.$id.' ~~~~~~~'.N,true,false);
    waituntilonline();
    updexarr();
    // Lookup tables mirroring $ainsts, $insts and $deadinsts.
    $checkedset=array_fill_keys($ainsts,true);
    $respondingset=array_fill_keys($insts,true);
    $deadset=array_fill_keys($deadinsts,true);
    foreach ($list as $inst) {
        if (!isset($checkedset[$inst])) {
            $checkedset[$inst]=true;
            $ainsts[]=$inst;
            fwrite($allpeersf,$inst.N);
        }
    }
    // Queue for the next round and its lookup table.
    $nlist=array();
    $queuedset=array();
    $c=count($list);
    $i=0;
    $rtini=time();
    foreach ($list as $inst) {
        $i++;
        $now=time();
        $rtela=$now-$rtini;
        gecho('>>> '.$inst.N,true,false);
        gecho('@@@ Round '.$id.', '.$i.'/'.$c.': TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' responding insts; '.count($nlist).' insts in next round list; '.count($ainsts).' total.'.N,true,false);
        gecho('Trying to load «'.$inst.'»\'s peers...'.N,true,false);
        $peers=getfc('https://'.$inst.'/api/v1/instance/peers',$opts['timeout']);
        if ($peers['cont']===false) {
            gecho('ERROR: '.$peers['emsg'].N,true,true);
            continue;
        }
        $peers=@json_decode($peers['cont'],true);
        if (!is_array($peers)) {
            gecho('ERROR: $peers is not an array (its type is '.gettype($peers).').'.N,true,true);
            continue;
        }
        gecho('LOADED!'.N,true,false);
        if (isset($respondingset[$inst])) {
            gecho('NOTICE: «'.$inst.'» is not a new instance (it was already in $insts).'.N,true,false);
        } else {
            gecho('NEW INSTANCE FOUND: «'.$inst.'».'.N,true,false);
            $respondingset[$inst]=true;
            $insts[]=$inst;
            fwrite($peersf,$inst.N);
        }
        foreach ($peers as $peer) {
            if (!is_string($peer)) {
                gecho(' ERROR: I wont add this peer to next round list because its name is not a string.'.N,true,true);
            } elseif (!validhostname($peer)) {
                gecho(' ERROR: I wont add «'.$peer.'» to next round list because its not a valid hostname.'.N,true,true);
            } elseif (ckexarr($peer)) {
                gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,true);
            } elseif (isset($checkedset[$peer])) {
                // $ainsts already contains the whole current $list and
                // all of $insts, so no separate checks are needed for those.
                if ($opts['verbose'])
                    gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $ainsts.'.N,true,false);
            } elseif (isset($queuedset[$peer])) {
                if ($opts['verbose'])
                    gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $nlist.'.N,true,false);
            } elseif ($opts['excludedead'] && isset($deadset[$peer])) {
                if ($opts['verbose'])
                    gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its DEAD.'.N,true,false);
            } else {
                gecho(' ADDING PEER «'.$peer.'» to next round list.'.N,true,false);
                $queuedset[$peer]=true;
                $nlist[]=$peer;
            }
        }
    }
    if (count($nlist)>0) {
        // Free what this round no longer needs before going one level deeper.
        unset($list,$checkedset,$respondingset,$deadset,$queuedset);
        crawl($nlist,$id+1);
    } else {
        gecho('Next round list is empty.'.N,true,false);
    }
    gecho('~~~~~~~ END OF ROUND '.$id.' ~~~~~~~'.N,true,false);
}
// Main run: crawl from the start instance, save the sorted lists, then print
// a summary.
$tini=time();
crawl(array($opts['startinst']),1);
gecho('DONE CRAWLING! :-)'.N,true,false);
shutdown(true);
$now=time();
gecho('Crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'.'.N,true,false);
gecho(count($ainsts).' URIs checked in '.ght($now-$tini).'; '.count($insts).' responded. Max memory usage: '.ghs(memory_get_peak_usage(true)).N,true,false);
// shutdown() normally removes the lockfile already; only unlink what is
// still there, so that no «No such file or directory» warning is raised.
if (file_exists($lockfp))
    unlink($lockfp);
exit(0);
?>