Big rewrite: made it shorter and hopefully a bit more readable

This commit is contained in:
pezcurrel 2022-12-21 22:07:05 +01:00
parent 732ea79480
commit 9316e686b9

View file

@ -16,25 +16,26 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const N="\n";
define('N',"\n");
require(__DIR__.'/../site/mustard/include/gurl.php');
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');
setlocale(LC_ALL,getenv('LANG'));
$opts=array(
$opts=[
'inifp'=>__DIR__.'/../conf/mustard.ini',
'startinst'=>'mastodon.social',
'peersfp'=>__DIR__.'/peers',
'allpeersfp'=>__DIR__.'/peers.all',
'apeersfp'=>__DIR__.'/peers.all',
'cpeersfp'=>__DIR__.'/peers.checked',
'restore'=>false,
'excludefp'=>null,
'timeout'=>5,
'curltimeout'=>10,
'verbose'=>false,
'excludedead'=>false,
'timezone'=>date_default_timezone_get(),
'ignorelock'=>false
);
];
$help='peerscrawl.php
DESCRIPTION
@ -50,10 +51,14 @@ $help='peerscrawl.php
Defines the file into which the ordered list of responding instances
will be saved.
DEFAULT: «'.$opts['peersfp'].'»
-a, --allpeersfp <file>
Defines the file into which the ordered list of all checked instances
will be saved.
DEFAULT: «'.$opts['allpeersfp'].'»
-a, --apeersfp <file>
Defines the file into which the ordered list of all instances will
be saved.
DEFAULT: «'.$opts['apeersfp'].'»
-c, --cpeersfp <file>
Defines the file into which the ordered list of all checked instances will
be saved.
DEFAULT: «'.$opts['cpeersfp'].'»
-I, --ignorelock
Normally, if its lockfile exists, the program exits with an error before
doing anything. With this option the lockfile is ignored. Please verify
@ -70,12 +75,6 @@ $help='peerscrawl.php
-t, --timeout <seconds>
Defines the timeout in seconds for every connection attempt.
DEFAULT: «'.$opts['timeout'].'»
-T, --timezone <timezone identifier>
Defines the timezone for displaying localized values for dates and times.
DEFAULT on this system: «'.$opts['timezone'].'»
Note: if you want localized format as well set LANG environment variable.
-L, --tzlist
List all valid timezones and exit.
-E, --excludedead
Exclude instances marked as "Dead" in the database.
-v, --verbose
@ -93,23 +92,30 @@ for ($i=1; $i<$argc; $i++) {
case '-s':
case '--startinst':
if ($i+1>=$argc)
mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N,1);
mexit('Error: option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N,1);
$i++;
$opts['startinst']=$argv[$i];
break;
case '-p':
case '--peersfp':
if ($i+1>=$argc)
mexit('Option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1);
mexit('Error: option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1);
$i++;
$opts['peersfp']=$argv[$i];
break;
case '-a':
case '--allpeersfp':
case '--apeersfp':
if ($i+1>=$argc)
mexit('Option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1);
mexit('Error: option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1);
$i++;
$opts['allpeersfp']=$argv[$i];
$opts['apeersfp']=$argv[$i];
break;
case '-c':
case '--cpeersfp':
if ($i+1>=$argc)
mexit('Error: option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1);
$i++;
$opts['cpeersfp']=$argv[$i];
break;
case '-r':
case '--restore':
@ -122,31 +128,17 @@ for ($i=1; $i<$argc; $i++) {
case '-e':
case '--excludefp':
if ($i+1>=$argc || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable files path (use «-h» for more info).'.N,1);
mexit('Error: option «'.$argv[$i].'» has to be followed by an existing, readable files path (use «-h» for more info).'.N,1);
$i++;
$opts['excludefp']=$argv[$i];
break;
case '-t':
case '--timeout':
if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
mexit('Option «'.$argv[$i].'» has to be followed by a number of seconds (use «-h» for more info).'.N,1);
mexit('Error: option «'.$argv[$i].'» has to be followed by a number of seconds (use «-h» for more info).'.N,1);
$i++;
$opts['timeout']=$argv[$i]+0;
break;
case '-T':
case '--timezone':
if ($i+1>=$argc || !@date_default_timezone_set($argv[$i+1]))
mexit('Option «'.$argv[$i].'» has to be followed by a valid timezone identifier (use «-h» for more info).'.N,1);
$i++;
$opts['timezone']=$argv[$i];
break;
case '-L':
case '--tzlist':
$buf=timezone_identifiers_list();
foreach ($buf as $val)
gecho($val.N,false,false);
exit(0);
break;
case '-E':
case '--excludedead':
$opts['excludedead']=true;
@ -160,7 +152,7 @@ for ($i=1; $i<$argc; $i++) {
mexit($help,0);
break;
default:
mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N,1);
mexit('Error: option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N,1);
break;
}
}
@ -173,6 +165,132 @@ if (is_file($lockfp) && !$opts['ignorelock']) {
}
touch($lockfp);
//declare(ticks=1);
pcntl_async_signals(true);
pcntl_signal(SIGTERM,'signalHandler');// Termination ('kill' was called)
pcntl_signal(SIGHUP,'signalHandler');// Terminal log-out
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
// List of instance URIs flagged as dead in the database; it stays empty
// unless the «excludedead» option is set.
$deadinsts=[];
if ($opts['excludedead']) {
	// Database credentials are read from the ini file pointed to by «inifp».
	$iniarr=@parse_ini_file($opts['inifp'])
		or mexit('Error: couldnt open «'.$opts['inifp'].'».'.N,1);
	try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
	catch (Exception $error) { mexit('Error: couldnt connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true); }
	// for php versions < 8
	if ($link===false) mexit('Error: couldnt connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true);
	try { $res=mysqli_set_charset($link,'utf8mb4'); }
	// Same wording as the fallback below (the closing parenthesis after the error number was missing here).
	catch (Exception $error) { mexit('Error: couldnt set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,1,true); }
	// for php versions < 8
	if ($res===false) mexit('Error: couldnt set «utf8mb4» charset for MySQL: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,1,true);
	$res=myq($link,'SELECT URI FROM Instances WHERE Dead=1');
	// The connection is closed before the rows are read from $res.
	// NOTE(review): this relies on the result set being buffered client-side — confirm.
	mysqli_close($link);
	while ($row=mysqli_fetch_assoc($res))
		$deadinsts[]=$row['URI'];
	unset($res);
	gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
}
$insts=[];
$ainsts=[];
$cinsts=[];
$exarr=[];
if ($opts['restore']) {
if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
gecho('Loading «'.$opts['peersfp'].'».'.N,true,false);
$insts=file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
} else {
mexit('Error: couldnt open «'.$opts['peersfp'].'» for reading.'.N,1);
}
}
$peersf=@fopen($opts['peersfp'],'w');
if (!$peersf) mexit('Error: couldnt open «'.$opts['peersfp'].'» in write mode.'.N,1);
$apeersf=@fopen($opts['apeersfp'],'w');
if (!$apeersf) mexit('Error: couldnt open «'.$opts['apeersfp'].'» in write mode.'.N,1);
$cpeersf=@fopen($opts['cpeersfp'],'w');
if (!$cpeersf) mexit('Error: couldnt open «'.$opts['cpeersfp'].'» in write mode.'.N,1);
$tini=time();
crawl([$opts['startinst']],1);
gecho('Done crawling! :-)'.N,true,false);
shutdown(true);
$now=time();
gecho('Crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'.'.N,true,false);
gecho(count($ainsts).' URIs checked in '.ght($now-$tini).' ('.$maxround.' rounds); '.count($insts).' responded. Max. memory usage: '.ghs(memory_get_peak_usage(true)).'.'.N,true,false);
exit(0);
// functions
/**
 * Crawls one round of instances, then recurses into the next round.
 *
 * Every instance in $list is appended to the "all instances" list and file,
 * asked for its peers (https://<instance>/api/v1/instance/peers) and appended
 * to the "checked instances" list and file. Instances answering with a JSON
 * array are appended to the "responding instances" list and file. Peers that
 * pass every filter are queued and crawled by a recursive call with $id+1.
 *
 * Works on the global lists ($insts, $ainsts, $cinsts), on the open file
 * handles ($peersf, $apeersf, $cpeersf) and updates the global $maxround.
 *
 * @param array $list hostnames to check in this round
 * @param int   $id   number of this round (the first call passes 1)
 */
function crawl($list,$id) {
	global $insts, $deadinsts, $cinsts, $ainsts, $tini, $opts, $peersf, $cpeersf, $apeersf, $maxround;
	// Track the deepest round reached. This must happen on the way down:
	// assigning it after the recursive call returns lets every outer round
	// overwrite the value, so it would always end up being 2 (or stay unset
	// when the crawl consists of a single round).
	if (!isset($maxround) || $id>$maxround) $maxround=$id;
	gecho('###### START OF ROUND '.$id.' ######'.N,true,false);
	$nlist=[];
	$c=count($list);
	$i=0;
	$rtini=time();
	foreach ($list as $inst) {
		if (!in_array($inst,$ainsts,true)) {
			$ainsts[]=$inst;
			fwrite($apeersf,$inst.N);
		}
		$i++;
		$now=time();
		$rtela=$now-$rtini;
		// TET: total elapsed time; ETR: estimated time remaining for this round,
		// extrapolated from the average time spent per instance so far.
		gecho('Working on «'.$inst.'»: round '.$id.', '.$i.'/'.$c.'; TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' discovered instances; '.count($nlist).' instances in next round list.'.N,true,false);
		waituntilonline();
		updexarr();
		gecho('Trying to load «'.$inst.'»’s peers...'.N,true,false);
		$peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['timeout'],$opts['curltimeout']);
		// No in_array check needed here: peers that were already checked or are
		// already queued are filtered out before being added to a round list.
		$cinsts[]=$inst;
		fwrite($cpeersf,$inst.N);
		if ($peers['cont']===false) {
			gecho('Error loading «'.$inst.'»’s peers: '.$peers['emsg'].'.'.N,true,true);
			continue;
		}
		$peers=@json_decode($peers['cont'],true);
		if (!is_array($peers)) {
			gecho('Error loading «'.$inst.'»’s peers: got not good JSON.'.N,true,true);
			continue;
		}
		gecho('Successfully loaded «'.$inst.'»’s peers :-)'.N,true,false);
		if (!in_array($inst,$insts,true)) {
			gecho('Discovered instance «'.$inst.'» :-)'.N,true,false);
			$insts[]=$inst;
			fwrite($peersf,$inst.N);
		}
		foreach ($peers as $peer) {
			// The peers list comes from a remote server: anything that is not a
			// string is dropped before it can reach the output files or the
			// string-based checks below.
			if (!is_string($peer)) {
				if ($opts['verbose']) gecho(' Not adding a peer to next round list because its name is not a string.'.N,true,true);
				continue;
			}
			if (!in_array($peer,$ainsts,true)) {
				$ainsts[]=$peer;
				fwrite($apeersf,$peer.N);
			}
			// Collect every reason for skipping the peer, so the verbose log shows them all.
			$whynot=[];
			if (in_array($peer,$cinsts,true)) $whynot[]='it has already been checked';
			if (!validhostname($peer)) $whynot[]='its name is not a valid hostname';
			if (ckexarr($peer)) $whynot[]='its name matches an exclusion regexp';
			if (in_array($peer,$list,true)) $whynot[]='it is already present in current list';
			if (in_array($peer,$nlist,true)) $whynot[]='it has already been added to next round list';
			if ($opts['excludedead'] && in_array($peer,$deadinsts,true)) $whynot[]='its dead';
			if (count($whynot)>0) {
				if ($opts['verbose']) gecho(' Not adding peer «'.$peer.'» to next round list because '.implode(', ',$whynot).'.'.N,true,true);
			} else {
				if ($opts['verbose']) gecho(' Adding peer «'.$peer.'» to next round list :-)'.N,true,false);
				$nlist[]=$peer;
			}
		}
	}
	if (count($nlist)>0) {
		// The current list is not needed any more: drop it before going deeper.
		unset($list);
		crawl($nlist,$id+1);
	} else {
		gecho('Next round list is empty.'.N,true,false);
	}
	gecho('###### END OF ROUND '.$id.' ######'.N,true,false);
}
function mexit($msg,$code) {
global $link, $lockfp;
if (isset($link) && $link!==false) mysqli_close($link);
@ -194,18 +312,13 @@ function gecho($msg,$prtime,$iserr) {
}
/**
 * Runs a query and returns its result, reporting any failure through mexit()
 * with exit code 2 (mexit() is presumably terminating the program — its body
 * is not fully visible here).
 *
 * The query is executed exactly once; both failure paths report the same
 * message format.
 *
 * @param mysqli $link  open database connection
 * @param string $query SQL statement to execute
 * @return mixed whatever mysqli_query() returned for the statement
 */
function myq(&$link,$query) {
	try { $res=mysqli_query($link,$query); }
	catch (Exception $error) { mexit('Error: query «'.$query.'» failed: '.$error->getMessage().' ('.$error->getCode().').'.N,2); }
	// for php versions < 8, which seem to not catch mysql exceptions
	if ($res===false) mexit('Error: query «'.$query.'» failed: '.mysqli_error($link).' ('.mysqli_errno($link).').'.N,2);
	return($res);
}
function microdate($time=null) {
if (is_null($time)) $time=microtime(false);
$time=explode(' ',$time);
@ -216,7 +329,7 @@ function sortcheckandsave(&$arr,$arrdesc,&$fp) {
$buc=count($arr);
$arr=array_unique($arr);
$auc=count($arr);
if ($buc!=$auc) gecho('WARNING: '.$arrdesc.' contained '.($buc-$auc).' duplicates, better check my code ;-)'.N,true,true);
if ($buc!=$auc) gecho('Warning: '.$arrdesc.' contained duplicates, better check my code ;-)'.N,true,true);
gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N,true,false);
sort($arr);
$f=@fopen($fp,'w');
@ -225,71 +338,31 @@ function sortcheckandsave(&$arr,$arrdesc,&$fp) {
fwrite($f,$val.N);
fclose($f);
} else {
gecho('ERROR: couldnt open «'.$fp.'» for writing.'.N,true,true);
gecho('Error: couldnt open «'.$fp.'» for writing.'.N,true,true);
}
}
/**
 * Releases the resources held by the crawler: closes the three output files
 * and removes the lockfile; optionally rewrites the output files in sorted,
 * de-duplicated form.
 *
 * @param bool $dosort when true, the responding, checked and all-instances
 *                     lists are saved again through sortcheckandsave()
 */
function shutdown($dosort) {
	// $cpeersf and $cinsts must be imported too: without them the checked-peers
	// file was never closed and the checked list was undefined when sorting.
	global $opts, $peersf, $apeersf, $cpeersf, $insts, $ainsts, $cinsts, $lockfp;
	if (isset($peersf) && $peersf!==false) @fclose($peersf);
	if (isset($apeersf) && $apeersf!==false) @fclose($apeersf);
	if (isset($cpeersf) && $cpeersf!==false) @fclose($cpeersf);
	if (isset($lockfp) && file_exists($lockfp)) unlink($lockfp);
	if ($dosort) {
		sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
		sortcheckandsave($cinsts,'list of checked instances',$opts['cpeersfp']);
		sortcheckandsave($ainsts,'list of all instances',$opts['apeersfp']);
	}
}
// WARNING: when the script's output is piped, e.g. "script.php | tee script.log",
// this handler is still executed, even though its output is not shown.
/**
 * Signal handler: announces the interruption once, releases the resources
 * through shutdown() without re-sorting the output files, then exits with
 * code 3.
 *
 * @param int $signal number of the received signal
 */
function signalHandler($signal) {
	echo(N.'Interrupted (signal: '.$signal.').'.N);
	shutdown(false);
	//touch('KILLED');
	exit(3);
}
//declare(ticks=1);
pcntl_async_signals(true);
pcntl_signal(SIGTERM,'signalHandler');// Termination ('kill' was called)
pcntl_signal(SIGHUP,'signalHandler');// Terminal log-out
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
$deadinsts=array();
if ($opts['excludedead']) {
$iniarr=@parse_ini_file($opts['inifp'])
or mexit('ERROR: I couldnt open «'.$opts['inifp'].'».'.N,1);
try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
catch (Exception $error) { mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true); }
// for php versions < 8
if ($link===false) mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true);
try { $res=mysqli_set_charset($link,'utf8mb4'); }
catch (Exception $error) { mexit('could not set «utf8mb4» charset for MySQL: '.mysqli_error($link).'.'.N,1,true); }
// for php versions < 8
if ($res===false) mexit('could not set MySQL charset: '.mysqli_errno($link).': '.mysqli_error($link).'.'.N,1,true);
$res=myq($link,'SELECT URI FROM Instances WHERE Dead=1');
mysqli_close($link);
while ($row=mysqli_fetch_assoc($res))
$deadinsts[]=$row['URI'];
unset($res);
gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
}
$insts=array();
$ainsts=array();
$exarr=array();
if ($opts['restore']) {
if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
gecho('Loading «'.$opts['peersfp'].'».'.N,true,false);
$insts=file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
} else {
mexit('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N,1);
}
}
$peersf=@fopen($opts['peersfp'],'w');
if (!$peersf) mexit('I could not open «'.$opts['peersfp'].'» in write mode.'.N,1);
$allpeersf=@fopen($opts['allpeersfp'],'w');
if (!$allpeersf) mexit('I could not open «'.$opts['allpeersfp'].'» in write mode.'.N,1);
function isempty($val) {
if (preg_match('/^\s*$/',$val)===1)
@ -300,12 +373,14 @@ function isempty($val) {
/**
 * Blocks until a TCP connection to www.google.com on port 80 succeeds.
 *
 * Each attempt uses a 1 second connection timeout; after a failed attempt a
 * single warning is logged and the function sleeps 30 seconds (the time the
 * warning announces) before retrying. If at least one attempt failed, a
 * "back online" message is logged once connectivity is restored.
 */
function waituntilonline() {
	$url='www.google.com';
	$gotoff=false;
	while (false===($f=@fsockopen($url,80,$errno,$errstr,1))) {
		$gotoff=true;
		gecho('Warning: it seems we are offline, waiting 30 seconds before retrying :-('.N,true,true);
		sleep(30);
	}
	// The socket is only a connectivity probe: close it right away.
	fclose($f);
	if ($gotoff) gecho('It seems we are back online! :-)'.N,true,false);
}
function updexarr() {
@ -314,7 +389,7 @@ function updexarr() {
$f=@fopen($opts['excludefp'],'r');
if ($f!==false) {
$i=0;
$exarr=array();
$exarr=[];
while (!feof($f)) {
$i++;
$line=trim(fgets($f));
@ -338,9 +413,6 @@ function ckexarr($inst) {
return(false);
}
require(__DIR__.'/../site/mustard/include/ghs.php');
require(__DIR__.'/../site/mustard/include/ght.php');
function ismultibyte($s) {
preg_replace('/./u','.',$s,-1,$c);
(strlen($s)>$c) ? $r=true : $r=false;
@ -366,93 +438,4 @@ function validhostname($hostname) {
}
//$url='www.team.starschlep.com/'; if (validhostname($url)) echo('OK: '.$url.N); else echo('KO: '.$url.N); die();
function crawl($list,$id) {
global $ainsts, $insts, $deadinsts, $peersf, $allpeersf, $opts, $tini;
gecho('~~~~~~~ START OF ROUND '.$id.' ~~~~~~~'.N,true,false);
waituntilonline();
updexarr();
foreach ($list as $inst) {
if (!in_array($inst,$ainsts)) {
$ainsts[]=$inst;
fwrite($allpeersf,$inst.N);
}
}
$nlist=array();
$c=count($list);
$i=0;
$rtini=time();
foreach ($list as $inst) {
$i++;
$now=time();
$rtela=$now-$rtini;
gecho('>>> '.$inst.N,true,false);
gecho('@@@ Round '.$id.', '.$i.'/'.$c.': TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' responding insts; '.count($nlist).' insts in next round list; '.count($ainsts).' total.'.N,true,false);
gecho('Trying to load «'.$inst.s peers...'.N,true,false);
$peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['timeout']);
if ($peers['cont']===false) {
gecho('ERROR: '.$peers['emsg'].N,true,true);
} else {
$peers=@json_decode($peers['cont'],true);
if (!is_array($peers)) {
gecho('ERROR: $peers is not an array (its type is '.gettype($peers).').'.N,true,true);
} else {
gecho('LOADED!'.N,true,false);
if (in_array($inst,$insts)) {
gecho('NOTICE: «'.$inst.'» is not a new instance (it was already in $insts).'.N,true,false);
} else {
gecho('NEW INSTANCE FOUND: «'.$inst.'».'.N,true,false);
$insts[]=$inst;
fwrite($peersf,$inst.N);
}
foreach ($peers as $peer) {
if (!is_string($peer)) {
gecho(' ERROR: I wont add this peer to next round list because its name is not a string.'.N,true,true);
} elseif (!validhostname($peer)) {
gecho(' ERROR: I wont add «'.$peer.'» to next round list because its not a valid hostname.'.N,true,true);
} elseif (ckexarr($peer)) {
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,false);
} elseif (in_array($peer,$ainsts)) {
if ($opts['verbose'])
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $ainsts.'.N,true,false);
} elseif (in_array($peer,$nlist)) {
if ($opts['verbose'])
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $nlist.'.N,true,false);
// questo qui sotto diventa ridondante ora che uso $ainsts e lo popolo a inizio funzione
/*} elseif (in_array($peer,$list)) {
if ($opts['verbose'])
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $list.'.N,true,false);
// questo qui sotto è sempre stato ridondante
} elseif (in_array($peer,$insts)) {
if ($opts['verbose'])
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $insts.'.N,true,false);
}*/
} elseif ($opts['excludedead'] && in_array($peer,$deadinsts)) {
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its DEAD.'.N,true,false);
} else {
//EVVIVA!
gecho(' ADDING PEER «'.$peer.'» to next round list.'.N,true,false);
$nlist[]=$peer;
}
}
}
}
}
if (count($nlist)>0) {
unset($list);
crawl($nlist,$id+1);
} else {
gecho('Next round list is empty.'.N,true,false);
}
gecho('~~~~~~~ END OF ROUND '.$id.' ~~~~~~~'.N,true,false);
}
$tini=time();
crawl(array($opts['startinst']),1);
gecho('DONE CRAWLING! :-)'.N,true,false);
shutdown(true);
$now=time();
gecho('Crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'.'.N,true,false);
gecho(count($ainsts).' URIs checked in '.ght($now-$tini).'; '.count($insts).' responded. Max memory usage: '.ghs(memory_get_peak_usage(true)).N,true,false);
exit(0);
?>