Super-optimized it; made it a bit more verbose and clearer

This commit is contained in:
pezcurrel 2023-12-29 10:04:05 +01:00
parent c3d5050b42
commit 83868504d7

View file

@ -16,7 +16,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
define('N',"\n");
const N="\n";
define('SNAME',basename(__FILE__));
define('BNAME',preg_replace('/\.[^.]*$/','',SNAME));
@ -32,11 +32,11 @@ $opts=[
'inifp'=>__DIR__.'/../conf/mustard.ini',
'startinst'=>'mastodon.social',
'gracetime'=>$gracetime,
'peersfp'=>__DIR__.'/peers',
'cpeersfp'=>__DIR__.'/peers.checked',
'peersfp'=>__DIR__.'/peers.responding',
'ckpeersfp'=>__DIR__.'/peers.checked',
'excludefp'=>null,
'timeout'=>8,
'curltimeout'=>15,
'conntimeout'=>5,
'functimeout'=>10,
'loop'=>false,
'excludedead'=>true,
'ignorelock'=>false,
@ -45,8 +45,6 @@ $opts=[
$msglevs=['Debug', 'Info', 'Warning', 'Error', 'None'];
$ghtsa=[[' day',' days'],[' hour',' hours'],[' minute',' minutes'],[' second',' seconds']];
$help='SYNOPSIS
'.SNAME.' [options]
@ -70,7 +68,7 @@ OPTIONS
If an instance has not been responding for longer than this time, avoid
checking it. See section «TIME SPECIFICATION» below to see how to specify
time.
DEFAULT: '.ght($opts['gracetime'],$ghtsa).'
DEFAULT: '.ght($opts['gracetime'],null,0).'
-G, --graceline
Return the “graceline” (now - gracetime: see option above) in unix time and
local time, then exit.
@ -78,22 +76,22 @@ OPTIONS
Defines the file into which the ordered list of responding instances
will be saved.
DEFAULT: «'.$opts['peersfp'].'»
-c, --cpeersfp <file>
-c, --ckpeersfp <file>
Defines the file into which the ordered list of all checked instances will
be saved.
DEFAULT: «'.$opts['cpeersfp'].'»
DEFAULT: «'.$opts['ckpeersfp'].'»
-I, --ignorelock
Normally, if its lockfile exists, the program exits with an error before
doing anything. With this option the lockfile is ignored. Please verify
that the program is not already running before using it.
-t, --timeout <time>
-t, --conntimeout <time>
Defines the timeout in seconds for every connection attempt. See section
«TIME SPECIFICATION» below to see how to specify time.
DEFAULT: '.ght($opts['timeout'],$ghtsa).'
-T, --curltimeout <time>
DEFAULT: '.ght($opts['conntimeout'],null,0).'
-T, --functimeout <time>
Defines the timeout in seconds for every download. See section «TIME
SPECIFICATION» below to see how to specify time.
DEFAULT: '.ght($opts['curltimeout'],$ghtsa).'
DEFAULT: '.ght($opts['functimeout'],null,0).'
-m, --minmsgimplev <«debug»|«info»|«warning»|«error»|«none»>
Defines the minimum “importance level” of messages to be written to the text
user interface. There are 4 “importance levels”, in this order of
@ -132,11 +130,11 @@ for ($i=1; $i<$argc; $i++) {
mexit(3,'option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1,false);
$i++;
$opts['peersfp']=$argv[$i];
} elseif ($argv[$i]=='-c' || $argv[$i]=='--cpeersfp') {
} elseif ($argv[$i]=='-c' || $argv[$i]=='--ckpeersfp') {
if ($i+1>=$argc)
mexit(3,'option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1,false);
$i++;
$opts['cpeersfp']=$argv[$i];
$opts['ckpeersfp']=$argv[$i];
} elseif ($argv[$i]=='-I' || $argv[$i]=='--ignorelock') {
$opts['ignorelock']=true;
} elseif ($argv[$i]=='-e' || $argv[$i]=='--excludefp') {
@ -144,16 +142,16 @@ for ($i=1; $i<$argc; $i++) {
mexit(3,'option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1,false);
$i++;
$opts['excludefp']=$argv[$i];
} elseif ($argv[$i]=='-t' || $argv[$i]=='--timeout') {
} elseif ($argv[$i]=='-t' || $argv[$i]=='--conntimeout') {
if ($i+1>=$argc || ($time=parsetime($argv[$i+1]))===false)
mexit(3,'option «'.$argv[$i].'» requires a valid time specification as an argument (use «-h» to read help).'.N,1,false);
$i++;
$opts['timeout']=$time;
} elseif ($argv[$i]=='-T' || $argv[$i]=='--curltimeout') {
$opts['conntimeout']=$time;
} elseif ($argv[$i]=='-T' || $argv[$i]=='--functimeout') {
if ($i+1>=$argc || ($time=parsetime($argv[$i+1]))===false)
mexit(3,'option «'.$argv[$i].'» requires a valid time specification as an argument (use «-h» to read help).'.N,1,false);
$i++;
$opts['curltimeout']=$time;
$opts['functimeout']=$time;
} elseif ($argv[$i]=='-i' || $argv[$i]=='--includedead') {
$opts['excludedead']=false;
} elseif ($argv[$i]=='-m' || $argv[$i]=='--minmsgimplev') {
@ -183,6 +181,8 @@ pcntl_signal(SIGTERM,'sighandler');// Termination ('kill' was called)
pcntl_signal(SIGHUP,'sighandler');// Terminal log-out
pcntl_signal(SIGINT,'sighandler');// Interrupted (Ctrl-C is pressed)
lecho(1,'###### Starting '.BNAME.' ######'.N);
$iniarr=@parse_ini_file($opts['inifp']);
if ($iniarr===false) mexit(3,'couldnt open «'.$opts['inifp'].'».'.N,1,true);
try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
@ -197,13 +197,14 @@ if ($res===false) mexit(3,'couldnt set «utf8mb4» charset for MySQL: '.mysql
$deadinsts=[];
if ($opts['excludedead']) {
$graceline=time()-$opts['gracetime'];
lecho(1,'loading dead instances from “Instances” and “Peers” table (gracetime: '.ght($opts['gracetime'],null,0).'; graceline: '.date('Y-m-d H:i:s',$graceline).').'.N);
$res=myq($link,'SELECT URI FROM Instances WHERE LastOkCheckTS<'.$graceline.' OR (LastOkCheckTS IS NULL AND InsertTS<'.$graceline.')');
lecho(0,'got '.mysqli_num_rows($res).' dead instances from Instances table.'.N);
lecho(1,'got '.mysqli_num_rows($res).' dead instances from Instances table.'.N);
while ($row=mysqli_fetch_assoc($res))
if (!in_array($row['URI'],$deadinsts))
$deadinsts[]=$row['URI'];
$res=myq($link,'SELECT Hostname FROM Peers WHERE LastOkCheckTS<'.$graceline.' OR (LastOkCheckTS IS NULL AND FirstCheckTS<'.$graceline.')');
lecho(0,'got '.mysqli_num_rows($res).' dead instances from Peers table.'.N);
lecho(1,'got '.mysqli_num_rows($res).' dead instances from Peers table.'.N);
while ($row=mysqli_fetch_assoc($res))
if (!in_array($row['Hostname'],$deadinsts))
$deadinsts[]=$row['Hostname'];
@ -213,21 +214,20 @@ if ($opts['excludedead']) {
//mexit(0,'bau!'.N,0,true);
$insts=[];
$cinsts=[];
$ckinsts=[];
$exarr=[];
$notifs=[];
$maxround=1;
$newc=0;
$totnewc=0;
$tini=time();
$list=[$opts['startinst']];
// go
crawl($list,1);
lecho(1,'done crawling! :-)'.N);
$now=time();
lecho(1,'crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'; took '.ght($now-$tini).' in '.$maxround.' rounds; '.count($insts).' instance(s) responded; '.$newc.' new instances where found; max. memory usage: '.ghs(memory_get_peak_usage(true)).'.'.N);
lecho(1,'crawl started on '.date('Y-m-d H:i:s',$tini).' and ended on '.date('Y-m-d H:i:s',$now).'; took '.ght($now-$tini,null,0).' in '.$maxround.' rounds; '.count($insts).' instances responded; '.count($ckinsts).' instances were considered; '.$totnewc.' new instances were found; max. memory usage: '.ghs(memory_get_peak_usage(true)).'.'.N);
sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
sortcheckandsave($cinsts,'list of checked instances',$opts['cpeersfp']);
sortcheckandsave($ckinsts,'list of checked instances',$opts['ckpeersfp']);
mysqli_close($link);
unlink($lockfp);
lecho(1,'done :-)'.N);
@ -237,51 +237,80 @@ exit(0);
// functions
function crawl(&$list,$id) {
global $insts, $deadinsts, $cinsts, $tini, $opts, $maxround, $newc, $link;
global $insts, $ckinsts, $deadinsts, $tini, $opts, $maxround, $totnewc, $link;
$newc=0;
lecho(1,'###### START OF ROUND '.$id.' ######'.N);
$clist=count($list);
lecho(1,'will check '.$clist.' instance(s).'.N);
$nlist=[];
$c=count($list);
$i=0;
$buf=[];
foreach ($list as $inst) {
$i++;
$whynot=[];
if (in_array($inst,$ckinsts)) {
$whynot[]='it has already been checked';
$inckinsts=true;
} else {
$inckinsts=false;
}
if (!validhostname($inst)) $whynot[]='its hostname is not valid';
if (ckexarr($inst)) $whynot[]='its hostname matches an exclusion regexp';
if ($opts['excludedead'] && in_array($inst,$deadinsts)) $whynot[]='its dead';
if (count($whynot)>0) {
lecho(0,'excluding instance «'.$inst.'» ('.$i.'/'.$clist.'): '.implode(', ',$whynot).'.'.N);
} else {
lecho(0,'including instance «'.$inst.'» ('.$i.'/'.$clist.').'.N);
$buf[]=$inst;
if (!$inckinsts) $ckinsts[]=$inst;
}
}
$cbuf=count($buf);
$list=$buf;
unset($buf);
if ($clist-$cbuf>0)
lecho(1,'excluded '.($clist-$cbuf).' instances; '.$cbuf.' instances remaining.'.N);
$clist=$cbuf;
$i=0;
$rtini=time();
foreach ($list as $inst) {
$responded=false;
$i++;
$now=time();
$rtela=$now-$rtini;
lecho(1,'working on «'.$inst.'»: round '.$id.', '.$i.'/'.$c.'; TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' instances responded; '.count($nlist).' instances in next round list; '.$newc.' new instance(s) found.'.N);
waituntilonline();
lecho(1,'round '.$id.': working on instance «'.$inst.'» ('.$i.'/'.$clist.').'.N);
updexarr();
lecho(1,'trying to load «'.$inst.s peers...'.N);
$peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['timeout'],$opts['curltimeout']);
$cinsts[]=$inst;// don't need to check if in_array
waituntilonline();
lecho(1,'trying to load instance «'.$inst.s peers...'.N);
$peers=gurl('https://'.$inst.'/api/v1/instance/peers',$opts['conntimeout'],$opts['functimeout']);
if ($peers['cont']===false) {
lecho(2,'could not load «'.$inst.s peers: '.$peers['emsg'].'.'.N);
lecho(2,'could not load instance «'.$inst.s peers: '.$peers['emsg'].'.'.N);
} else {
$peers=@json_decode($peers['cont'],true);
if (!is_array($peers)) {
lecho(2,'loading «'.$inst.s peers, got bad JSON.'.N);
lecho(2,'expecting instance «'.$inst.s peers, got bad JSON instead.'.N);
} else {
$cp=count($peers);
lecho(1,'successfully loaded «'.$inst.s peers ('.$cp.') :-)'.N);
$responded=true;
$cpeers=count($peers);
lecho(1,'successfully loaded instance «'.$inst.s peers ('.$cpeers.') :-)'.N);
$pi=1;
foreach ($peers as $key=>$peer) {
if ($key!=$pi-1) {
lecho(2,'«'.$inst.s peers: entity '.$pi.'/'.$cp.'s key is not sequential; not checking further.'.N);
lecho(2,'instance «'.$inst.s peers: entity '.$pi.'/'.$cpeers.'s key is not sequential: not checking further.'.N);
break;
} elseif (!is_string($peer)) {
lecho(2,'«'.$inst.s peers: entity '.$pi.'/'.$cp.' is not a string; not checking further.'.N);
lecho(2,'instance «'.$inst.s peers: entity '.$pi.'/'.$cpeers.' is not a string: not checking further.'.N);
break;
} else {
$whynot=[];
if (in_array($peer,$cinsts)) $whynot[]='it has already been checked';
if (!validhostname($peer)) $whynot[]='its name is not a valid hostname';
if (ckexarr($peer)) $whynot[]='its name matches an exclusion regexp';
if (in_array($peer,$ckinsts)) $whynot[]='it has already been checked or excluded';
if (!validhostname($peer)) $whynot[]='its hostname is not valid';
if (ckexarr($peer)) $whynot[]='its hostname matches an exclusion regexp';
if (in_array($peer,$list)) $whynot[]='it is already present in current list';
if (in_array($peer,$nlist)) $whynot[]='it has already been added to next round list';
if (in_array($peer,$nlist)) $whynot[]='it is already present in next round list';// this should never happen, but.
if ($opts['excludedead'] && in_array($peer,$deadinsts)) $whynot[]='its dead';
if (count($whynot)>0) {
lecho(0,'«'.$inst.'»: not adding peer «'.$peer.'» ('.$pi.'/'.$cp.') to next round list because '.implode(', ',$whynot).'.'.N);
lecho(0,'instance «'.$inst.'»: not adding peer «'.$peer.'» ('.$pi.'/'.$cpeers.') to next round list: '.implode(', ',$whynot).'.'.N);
} else {
lecho(1,'«'.$inst.'»: adding peer «'.$peer.'» ('.$pi.'/'.$cp.') to next round list :-)'.N);
lecho(1,'instance «'.$inst.'»: adding peer «'.$peer.'» ('.$pi.'/'.$cpeers.') to next round list :-)'.N);
$nlist[]=$peer;
}
}
@ -289,58 +318,74 @@ function crawl(&$list,$id) {
}
}
}
if (!ckexarr($inst)) {
$responded=false;
lecho(1,'trying to load «'.$inst.s instance info...'.N);
$instinfo=gurl('https://'.$inst.'/api/v1/instance',$opts['timeout'],$opts['curltimeout']);
if (!$responded) {
lecho(1,'instance «'.$inst.'» didnt respond at its “peers” endpoint; trying to load its info from “instance” endpoint...'.N);
$instinfo=gurl('https://'.$inst.'/api/v1/instance',$opts['conntimeout'],$opts['functimeout']);
if ($instinfo['cont']===false) {
lecho(2,'could not load «'.$inst.s instance info: '.$instinfo['emsg'].'.'.N);
lecho(2,'could not load instance «'.$inst.s info: '.$instinfo['emsg'].'.'.N);
} else {
$instinfo=@json_decode($instinfo['cont'],true);
if (is_array($instinfo))
$responded=true;
else
lecho(2,'loading «'.$inst.s instance info, got bad JSON.'.N);
lecho(2,'expecting instance «'.$inst.s info, got bad JSON instead.'.N);
}
if ($responded && !in_array($inst,$insts)) {
lecho(1,'instance «'.$inst.'» responded :-)'.N);
$insts[]=$inst;
$res=myq($link,'SELECT ID FROM Instances WHERE URI=\''.myesc($link,$inst).'\'');
if (mysqli_num_rows($res)==0) {
lecho(1,'instance «'.$inst.'» is new :-)'.N);
myq($link,'INSERT INTO Instances SET URI=\''.myesc($link,$inst).'\', InsertTS='.$now);
$newc++;
}
}
$res=myq($link,'SELECT * FROM Peers WHERE Hostname=\''.myesc($link,$inst).'\'');
$nrows=mysqli_num_rows($res);
if ($nrows>0) {
if ($nrows>1) lecho(2,'«'.$inst.'» has '.$nrows.' records in “Peers” table! :-('.N);
$row=mysqli_fetch_assoc($res);
if ($responded) myq($link,'UPDATE Peers SET LastOkCheckTS='.$now.' WHERE ID='.$row['ID']);
}
$now=time();
if ($responded) {
lecho(1,'instance «'.$inst.'» responded :-)'.N);
$insts[]=$inst;
$res=myq($link,'SELECT ID FROM Instances WHERE URI=\''.myesc($link,$inst).'\'');
$cres=mysqli_num_rows($res);
if ($cres<1) {
lecho(1,'instance «'.$inst.'» is new to “Instances” table, adding it :-)'.N);
myq($link,'INSERT INTO Instances SET URI=\''.myesc($link,$inst).'\', InsertTS='.$now);
$totnewc++;
$newc++;
} elseif ($cres>1) {
lecho(2,'instance «'.$inst.'» has '.$cres.' records in “Instances” table! :-('.N);
} else {
$query='INSERT INTO Peers SET Hostname=\''.myesc($link,$inst).'\', FirstCheckTS='.$now;
if ($responded) $query.=', LastOkCheckTS='.$now;
myq($link,$query);
lecho(1,'instance «'.$inst.'» is already present in “Instances” table.'.N);//+++
}
} else {
lecho(2,'ignoring instance «'.$inst.'» because it matches an exclusion regexp: wont add it to «Instances» and «Peers» tables.'.N);
lecho(1,'instance «'.$inst.'» didnt respond :-('.N);
}
$res=myq($link,'SELECT * FROM Peers WHERE Hostname=\''.myesc($link,$inst).'\'');
$cres=mysqli_num_rows($res);
if ($cres<1) {
lecho(1,'instance «'.$inst.'» is new to “Peers” table, adding it :-)'.N);
$query='INSERT INTO Peers SET Hostname=\''.myesc($link,$inst).'\', FirstCheckTS='.$now;
if ($responded) $query.=', LastOkCheckTS='.$now;
myq($link,$query);
} elseif ($cres>0) {
if ($cres>1) lecho(2,'«'.$inst.'» has '.$cres.' records in “Peers” table! :-('.N);
if ($responded) {
lecho(1,'instance «'.$inst.'» is already present in “Peers” table, but it responded: updating its records “LastOkCheckTS” value...'.N);
$row=mysqli_fetch_assoc($res);
myq($link,'UPDATE Peers SET LastOkCheckTS='.$now.' WHERE ID='.$row['ID']);
}
}
$now=time();
$rtela=$now-$rtini;
lecho(1,'round '.$id.': finished working on instance «'.$inst.'» ('.$i.'/'.$clist.'); RoundElapsedTime: '.ght($rtela,null,0).'; RoundEstimatedTimeRemaining: '.ght($rtela/$i*$clist-$rtela,null,0).'; RoundNewInsts: '.$newc.'; NextRoundInsts: '.count($nlist).'; TotElapsedTime: '.ght($now-$tini,null,0).'; TotConsideredInsts: '.count($ckinsts).'; TotRespondingInsts: '.count($insts).'; TotNewInsts: '.$totnewc.'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).').'.N);
}
unset($list);
if (count($nlist)>0) {
$now=time();
$rtela=$now-$rtini;
$cnlist=count($nlist);
lecho(1,'END OF ROUND STATS: RoundCheckedInsts: '.$clist.'; RoundElapsedTime: '.ght($rtela,null,0).'; RoundNewInsts: '.$newc.'; NextRoundInsts: '.$cnlist.'; TotElapsedTime: '.ght($now-$tini,null,0).'; TotConsideredInsts: '.count($ckinsts).'; TotRespondingInsts: '.count($insts).'; TotNewInsts: '.$totnewc.'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).').'.N);
if ($cnlist<1) lecho(1,'next round list is empty.'.N);
lecho(1,'###### END OF ROUND '.$id.' ######'.N);
if ($cnlist>0) {
crawl($nlist,$id+1);
if ($id+1>$maxround) $maxround=$id+1;
} else {
lecho(1,'next round list is empty.'.N);
}
lecho(1,'###### END OF ROUND '.$id.' ######'.N);
}
function mexit($lev,$msg,$code,$remlock) {
global $link, $insts, $cinsts, $lockfp, $opts;
global $link, $insts, $ckinsts, $lockfp, $opts;
if (isset($insts) && is_array($insts)) sortcheckandsave($insts,'list of responding instances',$opts['peersfp']);
if (isset($cinsts) && is_array($cinsts)) sortcheckandsave($cinsts,'list of checked instances',$opts['cpeersfp']);
if (isset($ckinsts) && is_array($ckinsts)) sortcheckandsave($ckinsts,'list of checked instances',$opts['ckpeersfp']);
if ($remlock && isset($lockfp) && is_file($lockfp)) unlink($lockfp);
lecho($lev,$msg);
exit($code);
@ -369,7 +414,7 @@ function myq(&$link,$query) {
// Formats a microtime()-style string as «Y-m-d H:i:s.<fraction>».
// $time: a string in the «0.uuuuuuuu ssssssssss» form returned by
//        microtime(false) (fractional part, a space, then unix seconds);
//        when null, the current microtime(false) value is used.
//        NOTE(review): callers passing their own value are presumably expected
//        to use this same format - confirm against call sites.
// Returns the local date/time of the seconds part, a dot, and the digits of
// the fractional part with its leading «0.» removed.
// NOTE(review): two return statements are visible here. The first keeps every
// fractional digit; the second also drops the last two (substr(...,2,-2)).
// As written only the first one executes and the second is unreachable.
function microdate($time=null) {
if (is_null($time)) $time=microtime(false);
// split into [0] => fractional part («0.uuuuuuuu»), [1] => unix seconds
$time=explode(' ',$time);
return(date('Y-m-d H:i:s',$time[1]).'.'.substr($time[0],2));
return(date('Y-m-d H:i:s',$time[1]).'.'.substr($time[0],2,-2));
}
function sortcheckandsave(&$arr,$arrdesc,&$fp) {