MastodonHelp/web/clitools/crawler.php

402 lines
15 KiB
PHP
Raw Normal View History

2020-10-13 08:21:26 +02:00
#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Newline shorthand appended to the messages emitted by this script.
define('N',"\n");
// File name of this script (with extension); shown in the help text and in error messages.
define('SNAME',basename(__FILE__));
// Script name with its last extension stripped; base name of the lock and log files.
define('FNAME',preg_replace('/\.[^.]*$/','',SNAME));
// File name of the worker script that is launched once per instance.
define('CHILD','getinstinfo.php');
// Path of the shared include directory, relative to this script's directory.
define('LIBDP','/../site/mustard/include');
2020-10-13 08:21:26 +02:00
require(__DIR__.LIBDP.'/ght.php');
2020-10-13 08:21:26 +02:00
use function mysqli_real_escape_string as myesc;
2022-12-01 05:41:54 +01:00
2020-10-13 08:21:26 +02:00
// Ticks let signal handlers registered through pcntl_signal() be dispatched
// while the script is running.
declare(ticks=1);
if (function_exists('pcntl_signal')) {
	/**
	 * Shuts the script down when a termination signal is received.
	 *
	 * Prints a newline, then calls mexit() with exit code 0 and asks it to
	 * remove the lock file.
	 *
	 * @param int $signal Number of the received signal.
	 */
	function signalHandler($signal) {
		echo(N);
		mexit('received signal «'.$signal.'», shutting down.'.N,0,true);
	}
	pcntl_signal(SIGTERM,'signalHandler');// Termination ('kill' was called)
	pcntl_signal(SIGHUP,'signalHandler');// Terminal log-out
	pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
}
// Message levels in increasing order of importance; the array index is the
// numeric level used by eecho() and by the two «minmsglev» options.
$msglevs=['debug', 'info', 'warning', 'error', 'none'];
// Default option values; command line arguments override them.
$opts=[
	'poolsize'=>20,         // number of child processes run in parallel
	'moreclauses'=>'',      // extra SQL appended to the main instances query
	'peersfp'=>null,        // optional file listing additional instances
	'dontrestore'=>false,   // true: ignore job files from a previous session
	'ignorelock'=>false,    // true: run even if the lock file exists
	'logminmsglev'=>1,      // minimum level written to the log file (index into $msglevs)
	'tuiminmsglev'=>1       // minimum level written to the terminal (index into $msglevs)
];
2020-10-13 08:21:26 +02:00
// Singular/plural labels for days, hours, minutes and seconds.
// NOTE(review): presumably read by ght() from the required ght.php — confirm.
$ghtsa=[[' day',' days'],[' hour',' hours'],[' minute',' minutes'],[' second',' seconds']];
// Help text printed by «-h»/«--help»; the defaults it shows are taken from $opts and $msglevs.
$help='SYNOPSIS
'.SNAME.' [options]
DESCRIPTION
This script coordinates the parallel execution of a definable number of
'.CHILD.' processes “against” all the alive instances which are already
present in mastostarts database, plus optionally those listed in a
specifiable file (typically the output file from a peerscrawl.php run).
OPTIONS
-
Everything after a single dash will be passed to '.CHILD.' processes as is.
-p, --peersfp <file>
Defines the path to a file containing a list of instances to consider in
addition to those which are already present in the database. Note that this
option is ignored if the script will restore a previous unfinished session.
-P, --poolsize <number>
The number of slots in the processes pool, that is the number of '.CHILD.'
processes the script will run in parallel. Note that this option is ignored
if the script will restore a previous unfinished session.
DEFAULT: '.$opts['poolsize'].'
-I, --ignorelock
Normally, if its lockfile exists, the script will exit with an error.
If this option is set, instead, the lockfile existence will be ignored.
Please check that the script is actually not running before using it.
-R, --dontrestore
If this option is set and «instances.job» and «status.job» files from
a previous unfinished session are present in the «run» subdirectory inside
the directory where the script resides, the script will ignore them and
start a new session; otherwise the script will restore the previous,
unfinished session.
-m, --moreclauses <more SQL clauses>
If this option is set, whatever one writes as argument to the option will
be added to the main query for instances records, which is «SELECT URI FROM
Instances WHERE Dead=0».
-L, --logminmsglev <«debug»|«info»|«warning»|«error»|«none»>
Defines the minimum “importance level” of messages to be written into the
log file «run/[instance hostname].log». There are 4 “importance levels”, in
this order of importance: «debug», «info», «warning», «error».
Setting this option to any of these values will write into the logfile all
the messages with the specified or a greater level; setting it to the
special value «none» will completely disable logging to file.
DEFAULT: '.$msglevs[$opts['logminmsglev']].'
-T, --tuiminmsglev <«debug»|«info»|«warning»|«error»|«none»>
Defines the minimum “importance level” of messages to be written to the
terminal. See the option above to understand how this works.
DEFAULT: '.$msglevs[$opts['tuiminmsglev']].'
-h, --help
When this option is specified, the script will show this help text and exit.
LICENSE
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Options forwarded verbatim to every child process (everything after a lone «-»).
$childopts='';
// Parse the command line; each recognised option overrides a default in $opts.
// Any invalid option or argument terminates the script through mexit().
for ($i=1; $i<$argc; $i++) {
	$arg=$argv[$i];
	// True when another argument follows the current one.
	$hasval=($i+1<$argc);
	switch ($arg) {
		case '-':
			if (!$hasval) {
				eecho(2,'you have specified «-» as last argument...'.N);
				break;
			}
			// Consume all remaining arguments; each one is shell-escaped so that it
			// reaches the child exactly as it was typed, even if it contains spaces
			// or shell metacharacters.
			for ($i++; $i<$argc; $i++)
				$childopts.=' '.escapeshellarg($argv[$i]);
			break;
		case '-p':
		case '--peersfp':
			if (!$hasval || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
				mexit('option «'.$arg.'» requires an existing and readable file as an argument (use «-h» to read help).'.N,1,false);
			$opts['peersfp']=$argv[++$i];
			break;
		case '-P':
		case '--poolsize':
			// The pattern is anchored so that only a plain non-negative integer is accepted.
			if (!$hasval || preg_match('/^\d+$/',$argv[$i+1])!==1 || (int)$argv[$i+1]<1)
				mexit('option «'.$arg.'» requires an integer number greater than 0 as an argument (use «-h» to read help).'.N,1,false);
			$opts['poolsize']=(int)$argv[++$i];
			break;
		case '-R':
		case '--dontrestore':
			$opts['dontrestore']=true;
			break;
		case '-I':
		case '--ignorelock':
			$opts['ignorelock']=true;
			break;
		case '-m':
		case '--moreclauses':
			if (!$hasval)
				mexit('option «'.$arg.'» requires some SQL clause as argument (use «-h» to read help).'.N,1,false);
			$opts['moreclauses']=$argv[++$i];
			break;
		case '-L':
		case '--logminmsglev':
		case '-T':
		case '--tuiminmsglev':
			// The numeric level is the index of the (case-insensitive) name in $msglevs.
			$lev=$hasval ? array_search(strtolower($argv[$i+1]),$msglevs,true) : false;
			if ($lev===false)
				mexit('option «'.$arg.'» requires a “log level” value as an argument (use «-h» to read help).'.N,1,false);
			$optkey=($arg==='-L' || $arg==='--logminmsglev') ? 'logminmsglev' : 'tuiminmsglev';
			$opts[$optkey]=$lev;
			$i++;
			break;
		case '-h':
		case '--help':
			echo($help);
			exit(0);
		default:
			mexit('dont know how to interpret «'.$arg.'» (you can read the help text using «-h» or «--help»).'.N,1,false);
	}
}
// From here on the level names are only used as message prefixes: capitalise them.
$msglevs=array_map('ucfirst',$msglevs);
2020-10-13 08:21:26 +02:00
// Directory holding the lock file, the log file and the session job files.
$rundirpath=__DIR__.'/run';
$lockfp=$rundirpath.'/'.FNAME.'.lock';
// The run directory is validated (or created) first: the lock file lives inside
// it, so it cannot be created before the directory exists. No lock is held yet
// at this point, hence mexit() is told not to remove one.
if (!file_exists($rundirpath)) {
	if (@mkdir($rundirpath)===false)
		mexit('could not create directory «'.$rundirpath.'».'.N,1,false);
} elseif (!is_dir($rundirpath)) {
	mexit('«'.$rundirpath.'» is not a directory.'.N,1,false);
} elseif (!is_readable($rundirpath) || !is_writeable($rundirpath)) {
	mexit('«'.$rundirpath.'» is not readable and writeable.'.N,1,false);
}
// Refuse to start while a lock file exists, unless «-I» was given.
if (file_exists($lockfp) && !$opts['ignorelock']) {
	eecho(3,'lock file «'.$lockfp.'» exists (if you are sure '.SNAME.' is not already running you can use option «-I» to force execution).'.N);
	exit(1);
}
if (@touch($lockfp)===false) {
	eecho(3,'could not touch file «'.$lockfp.'».'.N);
	exit(1);
}
// Job files describing a session: the list of hosts and the scheduler status.
$instsjfp=$rundirpath.'/instances.job';
$statusjfp=$rundirpath.'/status.job';
// A previous session is resumed only when both job files exist and «-R» was not given.
$restore=(!$opts['dontrestore'] && file_exists($instsjfp) && file_exists($statusjfp));
$logfp=$rundirpath.'/'.FNAME.'.log';
// Append to the existing log when resuming, start a fresh one otherwise.
$mode=$restore ? 'a' : 'w';
$logf=fopen($logfp,$mode);
if ($logf===false)
	mexit('could not open log file «'.$logfp.'» for writing.'.N,1,true);
if ($restore)
	eecho(1,'--- restarting ---'.N);
else
	eecho(1,'--- starting ---'.N);
// Command line shared by all children; the host name is appended for each one.
$cmd=__DIR__.'/'.CHILD.$childopts;
eecho(1,'base command: «'.$cmd.'».'.N);
// Build the list of hosts ($insts), the scheduler counters ($instk, $toff,
// $done) and the initial processes pool ($procs), either by restoring the
// previous session from its job files or by starting a new one from the
// database (plus the optional peers file).
//
// Children are started with «>/dev/null 2>&1»: proc_open() runs the command
// through «sh», where the bash-only «&>» form would background the child
// instead of silencing it.
$procs=[];
if ($restore) {
	eecho(0,'looks like previous session was interrupted, trying to restore it...'.N);
	$insts=@file($instsjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
	if ($insts===false) mexit('could not open file «'.$instsjfp.'» for reading.'.N,1,true);
	$cinsts=count($insts);
	eecho(1,'loaded '.$cinsts.' hostnames from previous session file.'.N);
	$buf=@file($statusjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
	if ($buf===false) mexit('could not open file «'.$statusjfp.'» for reading.'.N,1,true);
	if (count($buf)<2) mexit('file «'.$statusjfp.'»: wrong format (1).'.N,1,true);
	// First line, tab separated: pool size, index of the last dispatched host,
	// elapsed seconds, number of finished hosts (see writestatus()).
	$head=explode("\t",$buf[0]);
	if (count($head)!=4 ||
		preg_match('/^\d+$/',$head[0])!==1 ||
		preg_match('/^\d+$/',$head[1])!==1 ||
		preg_match('/^\d+(\.\d+)?$/',$head[2])!==1 ||
		preg_match('/^\d+$/',$head[3])!==1)
		mexit('file «'.$statusjfp.'»: wrong format (2).'.N,1,true);
	$opts['poolsize']=(int)$head[0];
	$instk=(int)$head[1];
	$toff=(float)$head[2];
	$done=(int)$head[3];
	// Every following line is the index of a host that was being processed when
	// the previous session stopped: start it again.
	for ($i=1; $i<count($buf); $i++) {
		// The index must be a plain integer that actually exists in the hosts list.
		if (preg_match('/^\d+$/',$buf[$i])!==1 || !isset($insts[(int)$buf[$i]]))
			mexit('file «'.$statusjfp.'»: wrong format (3).'.N,1,true);
		$k=(int)$buf[$i];
		$host=$insts[$k];
		eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
		$procs[]=['proc'=>proc_open($cmd.' '.escapeshellarg($host).' >/dev/null 2>&1',[],$pipes[]), 'instk'=>$k, 'host'=>$host, 'begts'=>microtime(true)];
	}
	eecho(1,'restored previous session.'.N);
} else {
	$inifp=__DIR__.'/../conf/mustard.ini';
	$iniarr=@parse_ini_file($inifp);
	if ($iniarr===false) mexit('could not open config file «'.$inifp.'»'.N,1,true);
	try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
	catch (Exception $error) { mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true); }
	// for php versions < 8
	if ($link===false) mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true);
	try { $res=mysqli_set_charset($link,'utf8mb4'); }
	catch (Exception $error) { mexit('could not set «utf8mb4» charset for MySQL: '.mysqli_error($link).'.'.N,1,true); }
	// for php versions < 8
	if ($res===false) mexit('could not set MySQL charset: '.mysqli_errno($link).': '.mysqli_error($link).'.'.N,1,true);

	$insts=[];
	// Hosts already present in $insts, used as a set for constant-time duplicate checks.
	$seen=[];
	eecho(0,'loading known, alive instances from the database...'.N);
	$query='SELECT URI FROM Instances WHERE Dead=0';
	// Separate the user supplied clauses from the base query with a space.
	if ($opts['moreclauses']!=='') $query.=' '.$opts['moreclauses'];
	$res=myq($link,$query,__LINE__);
	while ($row=mysqli_fetch_assoc($res)) {
		$uri=(string)$row['URI'];
		if (!isset($seen[$uri])) {
			$seen[$uri]=true;
			$insts[]=$row['URI'];
		}
	}
	eecho(1,'loaded '.count($insts).' known, alive instances from the database.'.N);

	if (!is_null($opts['peersfp'])) {
		// The database connection is still open here: it is closed only after
		// this query, further below.
		eecho(0,'loading dead instances from the database...'.N);
		$res=myq($link,'SELECT URI FROM Instances WHERE Dead=1',__LINE__);
		$deadinsts=[];
		while ($row=mysqli_fetch_assoc($res))
			$deadinsts[(string)$row['URI']]=true;
		eecho(1,'loaded '.count($deadinsts).' dead instances from the database.'.N);
		eecho(0,'loading instances from «'.$opts['peersfp'].'»...'.N);
		$peers=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
		if ($peers===false) mexit('could not open «'.$opts['peersfp'].'» for reading.'.N,1,true);
		$added=0;
		foreach ($peers as $pdom) {
			// Skip hosts that are already queued (from the database or from an
			// earlier line of the peers file).
			if (isset($seen[$pdom])) continue;
			if (isset($deadinsts[$pdom])) {
				eecho(1,'ignoring instance «'.$pdom.'» from peers file because its dead.'.N);
				continue;
			}
			$seen[$pdom]=true;
			$insts[]=$pdom;
			$added++;
		}
		eecho(1,'loaded '.$added.' more instances from «'.$opts['peersfp'].'».'.N);
	}
	// All queries are done: close the connection and unset the variable so that
	// mexit() does not try to close it a second time.
	mysqli_close($link);
	unset($link);
	unset($deadinsts,$seen);

	shuffle($insts);
	$cinsts=count($insts);
	eecho(1,$cinsts.' instances to be checked.'.N);
	// Persist the (shuffled) hosts list so that an interrupted session can be restored.
	$instsf=@fopen($instsjfp,'w');
	if ($instsf===false) mexit('could not open «'.$instsjfp.'» for writing.'.N,1,true);
	foreach ($insts as $host) fwrite($instsf,$host.N);
	fclose($instsf);
	$toff=0;
	$done=0;
	// Fill the pool with the first hosts of the list.
	for ($instk=0; $instk<$opts['poolsize'] && $instk<$cinsts; $instk++) {
		$host=$insts[$instk];
		eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
		$procs[]=['proc'=>proc_open($cmd.' '.escapeshellarg($host).' >/dev/null 2>&1',[],$pipes[]), 'instk'=>$instk, 'host'=>$host, 'begts'=>microtime(true)];
	}
	// $instk must hold the index of the last dispatched host.
	$instk--;
}
// Main scheduling loop: once per second check every slot of the pool, replace
// finished processes with the next host of the list, report progress and save
// the status file. The loop ends when no slot holds a live process any more.
$tini=microtime(true);
do {
	$now=microtime(true);
	// Elapsed time, including the time spent in a restored previous session.
	$eta=$now-$tini+$toff;
	eecho(0,'[[[ CHECKING PROCESSES POOL ]]]'.N);
	$somerun=false;
	foreach ($procs as $key=>$proc) {
		// Empty slot: its process finished and there was no host left to replace it.
		if (is_null($proc)) continue;
		$pstat=proc_get_status($proc['proc']);
		if ($pstat['running']) {
			eecho(0,'proc slot '.$key.': been running on «'.$proc['host'].'» for '.ght($now-$proc['begts']).'.'.N);
			$somerun=true;
			continue;
		}
		$done++;
		$out='proc slot '.$key.': finished running on «'.$proc['host'].'» (exit code: '.$pstat['exitcode'].')';
		if ($instk<$cinsts-1) {
			$instk++;
			$host=$insts[$instk];
			// «>/dev/null 2>&1» instead of «&>»: proc_open() runs the command through
			// «sh», where the bash-only form would background the child.
			$procs[$key]=['proc'=>proc_open($cmd.' '.escapeshellarg($host).' >/dev/null 2>&1',[],$pipes[$key]), 'instk'=>$instk, 'host'=>$host, 'begts'=>$now];
			$out.='; started a new process on «'.$host.'».'.N;
			// The slot holds a live process again: without this the loop would end
			// whenever every slot happened to be refilled during the same pass.
			$somerun=true;
		} else {
			$out.='; no more hosts to check.'.N;
			$procs[$key]=null;
		}
		eecho(1,$out);
	}
	// With an empty hosts list there is nothing to do: report 100% instead of dividing by zero.
	$perc=($cinsts>0) ? round(100/$cinsts*$done) : 100;
	$out=$done.'/'.$cinsts.' ('.$perc.'%); elapsed time: '.ght($eta);
	if ($done>0) $out.='; estimated time remaining: '.ght($cinsts*$eta/$done-$eta);
	eecho(1,$out.'.'.N);
	if ($somerun) {
		writestatus($statusjfp,$opts,$instk,$eta,$done,$procs);
		sleep(1);
	}
} while ($somerun);
2020-10-13 08:21:26 +02:00
// The session completed: remove the job files and the lock file.
unlink($instsjfp);
// The status file is only written while processes are running, so it may not exist.
if (is_file($statusjfp)) unlink($statusjfp);
unlink($lockfp);
eecho(1,'done :-)'.N);
// Close the log handle before deleting the log file.
if (isset($logf) && $logf!==false) fclose($logf);
unlink($logfp);
exit(0);
2020-10-13 08:21:26 +02:00
// functions
2020-10-13 08:21:26 +02:00
/**
 * Saves the scheduler state to the status job file.
 *
 * The first line holds, tab separated, the pool size, the index of the last
 * dispatched host, the elapsed time and the number of finished hosts; each
 * following line holds the host index of one non-empty pool slot.
 * Calls mexit() with exit code 2 when the file cannot be opened.
 *
 * @param string $statusjfp Path of the status file.
 * @param array  $opts      Options array; only «poolsize» is read.
 * @param int    $instk     Index of the last dispatched host.
 * @param float  $eta       Elapsed time in seconds.
 * @param int    $done      Number of finished hosts.
 * @param array  $procs     Processes pool; null entries are skipped.
 */
function writestatus(&$statusjfp,&$opts,&$instk,&$eta,&$done,&$procs) {
	$handle=@fopen($statusjfp,'w');
	if ($handle===false)
		mexit('could not open «'.$statusjfp.'» for writing.'.N,2,true);
	$header=implode("\t",[$opts['poolsize'],$instk,$eta,$done]);
	fwrite($handle,$header.N);
	foreach ($procs as $slot) {
		if (is_null($slot)) continue;
		fwrite($handle,$slot['instk'].N);
	}
	fclose($handle);
}
/**
 * Emits a timestamped message to the terminal and/or to the log file.
 *
 * The message is prefixed with the current date and time (with the
 * fractional part of the second) and with the name of its level. It goes to
 * the terminal when the level reaches «tuiminmsglev» (standard output for
 * levels below 2, standard error otherwise) and to the log file when the
 * level reaches «logminmsglev» and the log file is open.
 *
 * @param int    $lev Message level, an index into the global $msglevs.
 * @param string $msg Message text.
 */
function eecho($lev,$msg) {
	global $logf, $opts, $msglevs;
	// microtime(false) returns «fraction seconds», e.g. «0.12345600 1600000000».
	[$frac,$secs]=explode(' ',microtime(false));
	$stamp=date('Y-m-d H:i:s',$secs).'.'.substr($frac,2);
	$line=$stamp.' '.$msglevs[$lev].': '.$msg;
	if ($lev>=$opts['tuiminmsglev']) {
		if ($lev>=2)
			fwrite(STDERR,$line);
		else
			echo($line);
	}
	$logopen=(isset($logf) && $logf!==false);
	if ($logopen && $lev>=$opts['logminmsglev'])
		fwrite($logf,$line);
}
/**
 * Runs a query and terminates the script if it fails.
 *
 * A failure is reported through mexit() with exit code 3, whether it shows
 * up as an exception or as a false return value from mysqli_query().
 *
 * @param mysqli $link  Database connection.
 * @param string $query SQL text to execute.
 * @param int    $line  Source line of the caller, reported in the error message.
 * @return mixed The value returned by mysqli_query().
 */
function myq(&$link,$query,$line) {
	$failpfx='query «'.$query.'» (line '.$line.') failed: ';
	try {
		$res=mysqli_query($link,$query);
	} catch (Exception $error) {
		mexit($failpfx.$error->getMessage().N,3,true);
	}
	// for older php versions < 8, which seem to not catch mysql exceptions
	if ($res===false)
		mexit($failpfx.mysqli_errno($link).': '.mysqli_error($link).'.'.N,3,true);
	return $res;
}
/**
 * Releases the resources held by the script, emits a final message and exits.
 *
 * Closes the database connection if one is set, optionally removes the lock
 * file, reports the message through eecho() (level 3 for a non-zero exit
 * code, level 1 otherwise), closes the log file and terminates.
 *
 * @param string $msg     Final message.
 * @param int    $code    Exit code of the script.
 * @param bool   $remlock Whether the lock file must be removed.
 */
function mexit($msg,$code,$remlock) {
	global $link, $logf, $lockfp;
	if (isset($link) && $link!==false)
		mysqli_close($link);
	if ($remlock && isset($lockfp) && is_file($lockfp))
		unlink($lockfp);
	// Any non-zero exit code is reported as an error, a zero one as information.
	$lev=($code!=0) ? 3 : 1;
	eecho($lev,$msg);
	if (isset($logf) && $logf!==false)
		fclose($logf);
	exit($code);
}
2020-10-13 08:21:26 +02:00
?>