MastodonHelp/web/clitools/crawler.php

405 lines
16 KiB
PHP
Raw Normal View History

2020-10-13 08:21:26 +02:00
#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
define('N',"\n");
define('SNAME',basename(__FILE__));
define('FNAME',preg_replace('/\.[^.]*$/','',SNAME));
define('CHILD','getinstinfo.php');
2023-12-26 11:17:54 +01:00
define('LIBDP','/../lib');
2020-10-13 08:21:26 +02:00
2023-12-26 11:17:54 +01:00
require __DIR__.LIBDP.'/ght.php';
require __DIR__.LIBDP.'/grace.php';
2023-12-26 11:17:54 +01:00
require __DIR__.LIBDP.'/parsetime.php';
2020-10-13 08:21:26 +02:00
use function mysqli_real_escape_string as myesc;
2022-12-01 05:41:54 +01:00
2020-10-13 08:21:26 +02:00
declare(ticks=1);
// Graceful shutdown on the usual "please stop" signals. This is only set up
// when the pcntl extension is available; signal delivery relies on the
// declare(ticks=1) statement above.
if (function_exists('pcntl_signal')) {
	/**
	 * Signal handler: reports the received signal and terminates the script
	 * through mexit() with exit code 0, asking it to remove the lock file.
	 *
	 * @param int $signal number of the received signal
	 */
	function signalHandler($signal) {
		mexit('received signal «'.$signal.'», shutting down.'.N,0,true);
	}
	// SIGTERM: termination ('kill' was called)
	// SIGHUP:  terminal log-out
	// SIGINT:  interrupted (Ctrl-C is pressed)
	foreach ([SIGTERM, SIGHUP, SIGINT] as $signo)
		pcntl_signal($signo,'signalHandler');
}
// Names of the message "importance levels"; the array index is the numeric
// level that eecho() receives as its first argument.
$msglevs=['Debug', 'Info', 'Warning', 'Error', 'None'];
// Default values of the command line options (overridden by the argument
// parsing loop further down).
$opts=[
	// $gracetime is not assigned in this file; presumably it is set by one
	// of the required library files - TODO confirm
	'gracetime'=>$gracetime,
	'poolsize'=>10,
	'peersfp'=>null,
	'dontrestore'=>false,
	'ignorelock'=>false,
	// index into $msglevs: 1 means «info»
	'minmsgimplev'=>1
];
// Help text shown by «-h»/«--help». The examples in the «TIME SPECIFICATION»
// section have balanced «» quotes and an explicit unit on every element.
$help='SYNOPSIS
'.SNAME.' [options]
DESCRIPTION
This script coordinates the parallel execution of a definable number of
'.CHILD.' processes “against” all the alive instances which are already
present in mastostarts database, plus optionally those listed in a
specifiable file (typically the output file from a peerscrawl.php run).
OPTIONS
-
Everything after a single dash will be passed to '.CHILD.' processes as is.
-g, --gracetime <time>
If an instance has not been responding for longer than this time, avoid
checking it. See section «TIME SPECIFICATION» below to see how to specify
time.
DEFAULT: '.ght($opts['gracetime'],null,0).'
-G, --graceline
Return the “graceline” (0:0:0 of today minus gracetime: see option above) in
unix time and local time, then exit.
-p, --peersfp <file>
Defines the path to a file containing a list of instances to consider in
addition to those which are already present in the database. Note that this
option is ignored if the script will restore a previous unfinished session.
-P, --poolsize <number>
The number of slots in the processes pool, that is the number of '.CHILD.'
processes the script will run in parallel. Note that this option is ignored
if the script will restore a previous unfinished session.
DEFAULT: '.$opts['poolsize'].'
-I, --ignorelock
Normally, if its lockfile exists, the script will exit with an error.
If this option is set, instead, the lockfile existence will be ignored.
Please check that the script is actually not running before using it.
-R, --dontrestore
If this option is set and «instances.job» and «status.job» files from
a previous unfinished session are present in the «run» subdirectory inside
the directory where the script resides, the script will ignore them and
start a new session; otherwise the script will restore the previous,
unfinished session.
-m, --minmsgimplev <«debug»|«info»|«warning»|«error»|«none»>
Defines the minimum “importance level” of messages to be written to the
text user interface. There are 4 “importance levels”, in this order of
importance: «debug», «info», «warning», «error».
Setting this option to any of these values will write to the text user
interface all the messages with the specified or a greater level; setting
it to the special value «none» will completely disable messages.
DEFAULT: '.lcfirst($msglevs[$opts['minmsgimplev']]).'
-h, --help
When this option is specified, the script will show this help text and exit.
TIME SPECIFICATION
An example is better than ~5148 words :-)
To specify 1 year, 6 months (made of 31 days), 2 weeks, 3 days, 5 hours,
7 minutes and 12 seconds you can use «1y,6M,2w,3d,5h,7m,12s»; but you can
also use «12s,7m,5h,3d,2w,6M,1y», or even «18M,1w,1w,2d,1d,3h,2h,7m,12s».
LICENSE
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Command line parsing. Recognised options update $opts; everything after a
// lone «-» is collected into $childopts and later appended as is to every
// child command line (see cmd()).
$childopts='';
for ($i=1; $i<$argc; $i++) {
	if ($argv[$i]=='-') {
		if ($i<$argc-1) {
			// swallow all the remaining arguments: they belong to the child
			$i++;
			while ($i<$argc) {
				$childopts.=' '.$argv[$i];
				$i++;
			}
		} else {
			eecho(2,'you have specified «-» as last argument...'.N);
		}
	} elseif ($argv[$i]=='-g' || $argv[$i]=='--gracetime') {
		if ($i+1>=$argc || ($time=parsetime($argv[$i+1]))===false)
			mexit('option «'.$argv[$i].'» requires a valid time specification as an argument (use «-h» to read help).'.N,1,false);
		$i++;
		$opts['gracetime']=$time;
	} elseif ($argv[$i]=='-G' || $argv[$i]=='--graceline') {
		// uses the gracetime known at this point, so «-g» must precede «-G» to affect it
		$graceline=getgraceline($opts['gracetime']);
		echo 'Graceline: '.$graceline.' ('.date('Y-m-d H:i:s',$graceline).').'.N;
		exit(0);
	} elseif ($argv[$i]=='-p' || $argv[$i]=='--peersfp') {
		if ($i+1>=$argc || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
			mexit('option «'.$argv[$i].'» requires an existing and readable file as an argument (use «-h» to read help).'.N,1,false);
		$i++;
		$opts['peersfp']=$argv[$i];
	} elseif ($argv[$i]=='-P' || $argv[$i]=='--poolsize') {
		// the pattern is anchored so that only a plain sequence of digits is
		// accepted (an unanchored «\d+» also matches things like «abc5»)
		if ($i+1>=$argc || preg_match('/^\d+$/',$argv[$i+1])!==1 || (int)$argv[$i+1]<1)
			mexit('option «'.$argv[$i].'» requires an integer number greater than 0 as an argument (use «-h» to read help).'.N,1,false);
		$i++;
		$opts['poolsize']=(int)$argv[$i];
	} elseif ($argv[$i]=='-R' || $argv[$i]=='--dontrestore') {
		$opts['dontrestore']=true;
	} elseif ($argv[$i]=='-I' || $argv[$i]=='--ignorelock') {
		$opts['ignorelock']=true;
	} elseif ($argv[$i]=='-m' || $argv[$i]=='--minmsgimplev') {
		if ($i+1>=$argc || !in_array(ucfirst(strtolower($argv[$i+1])),$msglevs,true))
			mexit('option «'.$argv[$i].'» requires a “message importance level” value as an argument (use «-h» to read help).'.N,1,false);
		$i++;
		// store under the key eecho() actually reads: 'minmsgimplev'
		$opts['minmsgimplev']=array_search(ucfirst(strtolower($argv[$i])),$msglevs,true);
	} elseif ($argv[$i]=='-h' || $argv[$i]=='--help') {
		echo($help);
		exit(0);
	} else {
		mexit('dont know how to interpret «'.$argv[$i].'» (you can read the help text using «-h» or «--help»).'.N,1,false);
	}
}
// Unix timestamp before which an instance is considered "dead".
$graceline=getgraceline($opts['gracetime']);
// The lock file and the job files live in the «run» subdirectory, so that
// directory has to exist and be usable *before* the lock file is touched;
// otherwise the very first run would fail with «could not touch file».
// The lock has not been taken yet here, hence mexit() is told not to remove it.
$rundirpath=__DIR__.'/run';
if (file_exists($rundirpath)) {
	if (!is_dir($rundirpath))
		mexit('«'.$rundirpath.'» is not a directory.'.N,1,false);
	if (!is_readable($rundirpath) || !is_writeable($rundirpath))
		mexit('«'.$rundirpath.'» is not readable and writeable.'.N,1,false);
} elseif (@mkdir($rundirpath)===false) {
	mexit('could not create directory «'.$rundirpath.'».'.N,1,false);
}
// Refuse to run when a lock file is already there, unless «-I» was given.
$lockfp=$rundirpath.'/'.FNAME.'.lock';
if (file_exists($lockfp) && !$opts['ignorelock']) {
	eecho(3,'lock file «'.$lockfp.'» exists (if you are sure '.SNAME.' is not already running you can use option «-I» to force execution).'.N);
	// plain exit: the existing lock file must be left in place
	exit(1);
}
if (@touch($lockfp)===false) {
	eecho(3,'could not touch file «'.$lockfp.'».'.N);
	exit(1);
}
// Job files used to resume an interrupted session.
$instsjfp=$rundirpath.'/'.FNAME.'_instances.job';
$statusjfp=$rundirpath.'/'.FNAME.'_status.job';
// A previous session is resumed only when both job files exist and the user
// did not ask for a fresh start with «-R».
$restore=(!$opts['dontrestore'] && file_exists($instsjfp) && file_exists($statusjfp));
eecho(1,($restore ? '--- restarting ---' : '--- starting ---').N);
// Slots of the processes pool and their pipes; both arrays are filled in
// lockstep so that a slot and its pipes always share the same key.
$procs=[];
$pipes=[];
if ($restore) {
	eecho(0,'looks like previous session was interrupted, trying to restore it...'.N);
	$insts=@file($instsjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
	if ($insts===false) mexit('could not open file «'.$instsjfp.'» for reading.'.N,1,true);
	$cinsts=count($insts);
	eecho(1,'loaded '.$cinsts.' hostnames from previous session file.'.N);
	$buf=@file($statusjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
	if ($buf===false) mexit('could not open file «'.$statusjfp.'» for reading.'.N,1,true);
	if (count($buf)<2) mexit('file «'.$statusjfp.'»: wrong format (1).'.N,1,true);
	// first line, as written by writestatus(): pool size, index of the last
	// started instance, elapsed time, number of finished checks
	$head=explode("\t",$buf[0]);
	if (count($head)!=4 ||
		preg_match('/^\d+$/',$head[0])!==1 ||
		preg_match('/^\d+$/',$head[1])!==1 ||
		preg_match('/^\d+(\.\d+)?$/',$head[2])!==1 ||
		preg_match('/^\d+$/',$head[3])!==1)
		mexit('file «'.$statusjfp.'»: wrong format (2).'.N,1,true);
	$opts['poolsize']=$head[0]+0;
	$instk=$head[1]+0;
	$toff=$head[2]+0;
	$done=$head[3]+0;
	// The remaining lines are indexes into $insts of the checks which were
	// running. All of them are validated before any process is started, so
	// that a malformed file can not leave orphaned children behind.
	$pending=[];
	for ($i=1; $i<count($buf); $i++) {
		if (preg_match('/^\d+$/',$buf[$i])!==1) mexit('file «'.$statusjfp.'»: wrong format (3).'.N,1,true);
		$k=(int)$buf[$i];
		// an index pointing outside the instances list means the two job files do not match
		if (!isset($insts[$k])) mexit('file «'.$statusjfp.'»: wrong format (4).'.N,1,true);
		$pending[]=$k;
	}
	foreach ($pending as $k) {
		$host=$insts[$k];
		eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
		$descspecs=[ 0=>['pipe','r'], 1=>['file',$rundirpath.'/'.$host.'.stdout.log','w'], 2=>['file',$rundirpath.'/'.$host.'.stderr.log','w'] ];
		$procs[]=['proc'=>proc_open(cmd($childopts,$host),$descspecs,$pipes[]), 'instk'=>$k, 'host'=>$host, 'begts'=>microtime(true)];
	}
	eecho(1,'restored previous session.'.N);
} else {
	$inifp=__DIR__.'/../conf/mustard.ini';
	$iniarr=@parse_ini_file($inifp);
	if ($iniarr===false) mexit('could not open config file «'.$inifp.'»'.N,1,true);
	try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
	catch (Exception $error) { mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true); }
	// for php versions < 8
	if ($link===false) mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true);
	try { $res=mysqli_set_charset($link,'utf8mb4'); }
	catch (Exception $error) { mexit('could not set «utf8mb4» charset for MySQL: '.mysqli_error($link).'.'.N,1,true); }
	// for php versions < 8
	if ($res===false) mexit('could not set MySQL charset: '.mysqli_errno($link).': '.mysqli_error($link).'.'.N,1,true);
	$insts=[];
	// hostname => true for everything already in $insts: gives constant time
	// duplicate checks instead of scanning the whole list for every row
	$known=[];
	eecho(0,'loading instances from the database...'.N);
	$res=myq($link,'SELECT URI FROM Instances WHERE (LastOkCheckTS IS NOT NULL AND LastOkCheckTS>='.$graceline.') OR (LastOkCheckTS IS NULL AND InsertTS>='.$graceline.')',__LINE__);
	while ($row=mysqli_fetch_assoc($res)) {
		if (!isset($known[$row['URI']])) {
			$known[$row['URI']]=true;
			$insts[]=$row['URI'];
		}
	}
	eecho(1,'loaded '.count($insts).' instances which responded at least once since '.date('Y-m-d H:i:s',$graceline).' from the database.'.N);
	if (!is_null($opts['peersfp'])) {
		eecho(0,'loading “dead” instances from the database...'.N);
		$res=myq($link,'SELECT URI FROM Instances WHERE LastOkCheckTS IS NULL OR LastOkCheckTS<'.$graceline,__LINE__);
		// again a hostname => true map, used only for membership tests
		$deadinsts=[];
		while ($row=mysqli_fetch_assoc($res))
			$deadinsts[$row['URI']]=true;
		eecho(0,'loaded '.count($deadinsts).' “dead” instances from the database.'.N);
		eecho(0,'loading instances from «'.$opts['peersfp'].'»...'.N);
		$peers=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
		if ($peers===false) mexit('could not open «'.$opts['peersfp'].'» for reading.'.N,1,true);
		$added=0;
		foreach ($peers as $pdom) {
			if (isset($known[$pdom])) continue;
			if (isset($deadinsts[$pdom])) {
				eecho(0,'ignoring instance «'.$pdom.'» from peers file because its dead.'.N);
				continue;
			}
			// The hostname becomes part of the per-host log file names, and the
			// peers file is external input: an entry with a slash or a NUL byte
			// would point outside the run directory or make proc_open() fail.
			if (strpbrk($pdom,"/\0")!==false) {
				eecho(2,'ignoring malformed hostname «'.$pdom.'» from peers file.'.N);
				continue;
			}
			$added++;
			$known[$pdom]=true;
			$insts[]=$pdom;
		}
		eecho(1,'loaded '.$added.' more instances from «'.$opts['peersfp'].'».'.N);
		unset($deadinsts);
	}
	unset($known);
	mysqli_close($link);
	unset($link);
	// random order, so that the checks are not always run in the same sequence
	shuffle($insts);
	$cinsts=count($insts);
	eecho(1,$cinsts.' instances to be checked.'.N);
	// save the (shuffled) list: a restored session indexes into it
	$instsf=@fopen($instsjfp,'w');
	if ($instsf===false) mexit('could not open «'.$instsjfp.'» for writing.'.N,1,true);
	foreach ($insts as $host) fwrite($instsf,$host.N);
	fclose($instsf);
	$toff=0;
	$done=0;
	for ($instk=0; $instk<$opts['poolsize'] && $instk<$cinsts; $instk++) {
		$host=$insts[$instk];
		eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
		$descspecs=[ 0=>['pipe','r'], 1=>['file',$rundirpath.'/'.$host.'.stdout.log','w'], 2=>['file',$rundirpath.'/'.$host.'.stderr.log','w'] ];
		$procs[]=['proc'=>proc_open(cmd($childopts,$host),$descspecs,$pipes[]), 'instk'=>$instk, 'host'=>$host, 'begts'=>microtime(true)];
	}
	// make $instk the index of the last started instance (-1 if none)
	$instk--;
}
// Main loop: once per second look at every slot of the pool, replace the
// processes which have finished with a process for the next instance, and
// go on until no slot has anything left to run.
$tini=microtime(true);
$rundone=false;
do {
	$now=microtime(true);
	// total elapsed time, including the time spent by a restored session
	$tet=$now-$tini+$toff;
	eecho(0,'[[[ CHECKING PROCESSES POOL ]]]'.N);
	$somerun=false;
	foreach ($procs as $key=>$proc) {
		// a null slot has been retired because there were no more hosts
		if (is_null($proc)) continue;
		$lev=1;
		if ($proc['proc']===false) {
			// proc_open() failed for this host: count the check as done and
			// reuse the slot, otherwise it would stay unusable forever
			$lev=2;
			$out='proc slot '.$key.': could not start a process on «'.$proc['host'].'»';
		} else {
			$pstat=proc_get_status($proc['proc']);
			if ($pstat['running']) {
				eecho(0,'proc slot '.$key.': been running on «'.$proc['host'].'» for '.ght($now-$proc['begts'],null,0).'.'.N);
				$somerun=true;
				continue;
			}
			if (isset($pipes[$key][0]) && is_resource($pipes[$key][0])) fclose($pipes[$key][0]);
			// the return value is not used: the exit code has already been
			// collected by proc_get_status() above
			proc_close($proc['proc']);
			$out='proc slot '.$key.': finished running on «'.$proc['host'].'» after '.ght($now-$proc['begts'],null,0).' (exit code: '.$pstat['exitcode'].')';
		}
		$done++;
		if ($instk<$cinsts-1) {
			$instk++;
			$host=$insts[$instk];
			$descspecs=[ 0=>['pipe','r'], 1=>['file',$rundirpath.'/'.$host.'.stdout.log','w'], 2=>['file',$rundirpath.'/'.$host.'.stderr.log','w'] ];
			$procs[$key]=['proc'=>proc_open(cmd($childopts,$host),$descspecs,$pipes[$key]), 'instk'=>$instk, 'host'=>$host, 'begts'=>$now];
			$out.='; started a new process on «'.$host.'».'.N;
			// The slot is busy again. Without this, a pass in which every slot
			// had finished (always the case with a pool of 1) would end the
			// loop right after starting new processes, leaving hosts unchecked.
			$somerun=true;
		} else {
			$out.='; no more hosts to check.'.N;
			$procs[$key]=null;
		}
		eecho($lev,$out);
	}
	// with no instances at all there is nothing to divide by: report 100%
	$out=$done.'/'.$cinsts.' ('.($cinsts>0 ? round(100/$cinsts*$done) : 100).'%); elapsed time: '.ght($tet,null,0);
	if ($done>0) $out.='; estimated time remaining: '.ght($cinsts*$tet/$done-$tet,null,0);
	eecho(1,$out.'.'.N);
	if ($somerun) {
		// save what is needed to resume the session, then wait before the next pass
		writestatus($statusjfp,$opts,$instk,$tet,$done,$procs);
		sleep(1);
	} else {
		$rundone=true;
	}
} while (!$rundone);
// Session completed: remove the job files and the lock. The status file does
// not exist if the loop never had anything running, hence the is_file() check.
foreach ([$instsjfp,$statusjfp,$lockfp] as $fp)
	if (is_file($fp)) unlink($fp);
eecho(1,'done :-)'.N);
exit(0);
2020-10-13 08:21:26 +02:00
// functions
2020-10-13 08:21:26 +02:00
/**
 * Saves the state needed to resume the current session.
 *
 * The file has one header line with four tab separated fields (pool size,
 * index of the last started instance, elapsed time in seconds, number of
 * finished checks), followed by one line per busy slot holding the index of
 * the instance that slot is working on.
 *
 * The content is written to a temporary file which is then renamed over the
 * status file, so that an interruption can not leave a truncated status file
 * behind. The elapsed time is written in plain decimal notation: the default
 * float to string conversion may use exponent notation (e.g. «9.0E-6») which
 * the restore code does not accept.
 *
 * On failure the script is terminated through mexit() with exit code 2.
 *
 * @param string $statusjfp path of the status file
 * @param array  $opts      options; only 'poolsize' is read
 * @param int    $instk     index of the last started instance
 * @param float  $tet       total elapsed time, in seconds
 * @param int    $done      number of finished checks
 * @param array  $procs     pool slots; null entries are skipped
 */
function writestatus(&$statusjfp,&$opts,&$instk,&$tet,&$done,&$procs) {
	$buf=$opts['poolsize']."\t".$instk."\t".sprintf('%.6F',$tet)."\t".$done.N;
	foreach ($procs as $proc)
		if (!is_null($proc))
			$buf.=$proc['instk'].N;
	$tmpfp=$statusjfp.'.tmp';
	$f=@fopen($tmpfp,'w');
	if ($f===false) mexit('could not open «'.$tmpfp.'» for writing.'.N,2,true);
	$written=fwrite($f,$buf);
	fclose($f);
	// replace the old status file only when the new one is complete
	if ($written!==strlen($buf) || !@rename($tmpfp,$statusjfp)) {
		@unlink($tmpfp);
		mexit('could not write «'.$statusjfp.'».'.N,2,true);
	}
}
/**
 * Builds the shell command line which runs the child script on one host.
 *
 * «exec» makes the shell replace itself with the child script; the child
 * script is looked up in the directory of this file.
 *
 * @param string $childopts options for the child, appended as they are
 *                          (expected to be empty or to start with a space)
 * @param string $host      hostname to check, shell-escaped here
 * @return string the command line
 */
function cmd(&$childopts, &$host) {
	$script=__DIR__.'/'.CHILD;
	return sprintf('exec %s%s %s',$script,$childopts,escapeshellarg($host));
}
/**
 * Writes a message to the text user interface, prefixed with the current
 * local time (with sub-second digits) and the name of its importance level.
 *
 * Messages below the configured minimum level ($opts['minmsgimplev']) are
 * dropped; levels 0 and 1 go to standard output, higher levels to standard
 * error.
 *
 * @param int    $lev importance level, an index into the global $msglevs
 * @param string $msg message text; no newline is added
 */
function eecho($lev,$msg) {
	global $opts, $msglevs;
	// microtime(false) gives «0.uuuuuuuu seconds»: split the two parts
	[$usec,$sec]=explode(' ',microtime(false));
	$line=date('Y-m-d H:i:s',$sec).'.'.substr($usec,2).' '.$msglevs[$lev].': '.$msg;
	if ($lev<$opts['minmsgimplev'])
		return;
	if ($lev>=2) {
		fwrite(STDERR,$line);
		return;
	}
	echo($line);
}
/**
 * Runs a query and returns its result; on failure the script is terminated
 * through mexit() with exit code 3.
 *
 * Both failure styles are handled: an exception thrown by mysqli_query()
 * and a plain «false» return value.
 *
 * @param mysqli $link  database connection
 * @param string $query SQL to be executed
 * @param int    $line  line number of the caller, reported in the error message
 * @return mixed whatever mysqli_query() returned
 */
function myq(&$link,$query,$line) {
	$failed='query «'.$query.'» (line '.$line.') failed: ';
	try {
		$res=mysqli_query($link,$query);
	} catch (Exception $error) {
		mexit($failed.$error->getMessage().N,3,true);
	}
	// for older php versions < 8, which seem to not catch mysql exceptions
	if ($res===false)
		mexit($failed.mysqli_errno($link).': '.mysqli_error($link).'.'.N,3,true);
	return $res;
}
/**
 * Terminates the script: closes the database connection if the global $link
 * is set, optionally removes the lock file, writes a final message and exits.
 *
 * The message is written with level «error» (3) when the exit code is not
 * zero, with level «info» (1) otherwise.
 *
 * @param string $msg     final message
 * @param int    $code    exit code of the script
 * @param bool   $remlock whether the lock file has to be removed
 */
function mexit($msg,$code,$remlock) {
	global $link, $lockfp;
	if (isset($link) && $link!==false) {
		mysqli_close($link);
	}
	if ($remlock) {
		if (isset($lockfp) && is_file($lockfp)) {
			unlink($lockfp);
		}
	}
	$lev=($code!=0) ? 3 : 1;
	eecho($lev,$msg);
	exit($code);
}
2020-10-13 08:21:26 +02:00
?>