Crawler new version, “multithreaded”, coordinator script, first commit

This commit is contained in:
pezcurrel 2022-12-16 00:00:06 +01:00
parent 1430cd80fb
commit 1cafbe05ea

View file

@ -0,0 +1,427 @@
#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Newline shorthand appended to log, error and help messages.
define('N',"\n");
// File name of this script, shown in the help text and in diagnostics.
define('SNAME',basename(__FILE__));
// Script file name with its last extension removed; used to build the lock file name.
define('FNAME',preg_replace('/\.[^.]*$/','',SNAME));
// File name of the per-instance worker script, as referred to in the help text.
define('CHILD','getinstinfo.php');
// Shared include directory, relative to the directory of this script.
define('LIBDP','/../../site/mustard/include');
// NOTE(review): ght() and parsetime(), called further down, are presumably defined in these two files — confirm.
require(__DIR__.LIBDP.'/ght.php');
require(__DIR__.LIBDP.'/parsetime.php');
// NOTE(review): the «myesc» alias is not referenced in the code visible in this file.
use function mysqli_real_escape_string as myesc;
// Ticks give the pcntl signal handlers registered below a chance to be dispatched while the script runs.
declare(ticks=1);
// Graceful shutdown on termination signals; skipped when the pcntl extension is not loaded.
if (function_exists('pcntl_signal')) {
	/**
	 * Signal callback: prints an empty line, then terminates the script through
	 * mexit() with exit code 0, reporting the number of the received signal.
	 *
	 * @param int $signal number of the signal that was delivered
	 */
	function signalHandler($signal) {
		echo(N);
		mexit('received signal «'.$signal.'», shutting down.'.N,0);
	}
	// SIGTERM: termination («kill» was called); SIGHUP: terminal log-out;
	// SIGINT: interruption (Ctrl-C was pressed).
	foreach ([SIGTERM,SIGHUP,SIGINT] as $signo)
		pcntl_signal($signo,'signalHandler');
	unset($signo);
}
// Default value of every command line option; the argument parsing loop below
// overrides the entries for which an option is given.
$opts=[
// number of child processes started to bootstrap the pool (-P)
'poolsize'=>20,
// text appended to the «alive instances» query (-m)
'moreclauses'=>'',
// path of an optional file listing additional hostnames, one per line (-p)
'peersfp'=>null,
// when true, job files left by an unfinished session are ignored (-R)
'dontrestore'=>false,
// when true, an already existing lock file does not stop the script (-I)
'ignorelock'=>false,
// value handed to each child with «-t»
'timeout'=>10,
'deadline'=>60*24*60*60,// if an instance has not been responding for more than this value of seconds, declare it dead
'oldline'=>30*24*60*60,// if an instance has been new for a period longer than this amount, it's no longer new
'ldtoots'=>40,// number of toots to check with the automatic language detection function
// when true, «-f» is added to the child command line
'fetchusers'=>false,
// when false, «-N» is added to the child command line
'setnew'=>true,
// when true, «-d» is added to the child command line
'dryrun'=>false,
// NOTE(review): the two «json» entries are not read anywhere in the code visible in this file.
'jsonfp'=>__DIR__.'/instances.json',
'jsonwrite'=>false,
];
// Help text printed by «-h»/«--help». It is built once, before the command line
// is parsed, so every «DEFAULT:» line shows the built-in default of the option.
// The time specification examples carry their unit suffix and closing quote.
$help='SYNOPSIS
'.SNAME.' [options]
DESCRIPTION
This script coordinates the parallel execution of a definable number of
'.CHILD.' processes “against” all the alive instances which are already
present in mastostarts database, plus optionally those listed in a
specifiable file (typically the output file from a peerscrawl.php run).
OPTIONS
-p, --peersfp <file>
Defines the path to a file containing a list of instances to consider in
addition to those which are already present in the database. Note that this
option is ignored if the script will recover a previous unfinished session.
-P, --poolsize <number>
The number of slots in the processes pool, that is the number of '.CHILD.'
processes the script will run in parallel. Note that this option is ignored
if the script will recover a previous unfinished session.
DEFAULT: '.$opts['poolsize'].'
-D, --deadline <time specification>
If an instance has not been responding for more than this time, declare
it dead. See section «TIME SPECIFICATION» below to see how to specify time.
This option gets passed to each '.CHILD.' process as is, and has no effect
on '.SNAME.' itself.
DEFAULT: '.ght($opts['deadline'],[' day§ days',' hour§ hours',' minute§ minutes',' second§ seconds']).'
-o, --oldline <time specification>
If an instance has been marked as new for more than this time, mark it as
not new. See section «TIME SPECIFICATION» below to see how to specify time.
This option gets passed to each '.CHILD.' process as is, and has no effect
on '.SNAME.' itself.
DEFAULT: '.ght($opts['oldline'],[' day§ days',' hour§ hours',' minute§ minutes',' second§ seconds']).'
-l, --ldtoots <number>
This option defines the number of toots that '.CHILD.' processes will try
to fetch from the local public timelines to try and guess the most used
languages of each instance. This option gets passed to each '.CHILD.'
process as is, and has no effect on '.SNAME.' itself. Its minimum value is
10, its maximum is 40.
DEFAULT: '.$opts['ldtoots'].'
-f, --fetchusers
If this option is set, the '.CHILD.' processes will try to fetch users
profiles infos from each considered instances user directory and store
them in the database. This option gets passed to each '.CHILD.' process as
is, and has no effect on '.SNAME.' itself.
-t, --timeout <seconds>
Sets the timeout in seconds for every connection attempt. This option gets
passed to each '.CHILD.' process as is, and has no effect on '.SNAME.'
itself.
DEFAULT: '.$opts['timeout'].'
-N, --dontsetnew
If this option is set, '.CHILD.' processes wont mark new instances as
new. This can be useful for a first run. This option gets passed to each
'.CHILD.' process as is, and has no effect on '.SNAME.' itself.
-I, --ignorelock
Normally, if its lockfile exists, the script will exit with an error.
If this option is set, instead, the lockfile existence will be ignored.
Please check that the script is actually not running before using it.
-R, --dontrestore
If this option is set and «instances.job» and «status.job» files from
a previous unfinished session are present in the «run» subdirectory inside
the directory where the script resides, the script will ignore them and
start a new session; otherwise the script will restore the previous,
unfinished session.
-d, --dryrun
If this option is set, '.CHILD.' processes wont write anything in the
database. It is meant for testing purposes. This option gets passed to each
'.CHILD.' process as is, and has no effect on '.SNAME.' itself.
-m, --moreclauses <more SQL clauses>
If this option is set, whatever one writes as argument to the option will
be added to the main query for instances records, which is «SELECT URI FROM
Instances WHERE Dead=0».
-h, --help
When this option is specified, the script will show this help text and exit.
TIME SPECIFICATION
An example is better than ~5147 words :-)
To specify 1 year, 6 months (made of 31 days), 2 weeks, 3 days, 5 hours,
7 minutes and 12 seconds you can use «1y,6M,2w,3d,5h,7m,12s»; but you can
also use «12s,7m,5h,3d,2w,6M,1y», or even «18M,1w,1w,2d,1d,3h,2h,7m,12s».
LICENSE
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Command line parsing. Each recognised option overrides its default in $opts;
// an option that takes a value consumes the argument that follows it. Any
// invalid value or unknown argument ends the script through mexit() with exit
// code 1; «-h»/«--help» prints the help text and exits with code 0.
for ($i=1; $i<$argc; $i++) {
	if ($argv[$i]=='-p' || $argv[$i]=='--peersfp') {
		if ($i+1>=$argc || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
			mexit('option «'.$argv[$i].'» requires an existing and readable file as an argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['peersfp']=$argv[$i];
	} elseif ($argv[$i]=='-P' || $argv[$i]=='--poolsize') {
		// the pattern is anchored, so that values such as «abc5» or «5x» are refused
		if ($i+1>=$argc || preg_match('/^\d+$/',$argv[$i+1])!==1 || $argv[$i+1]+0<1)
			mexit('option «'.$argv[$i].'» requires an integer number greater than 0 as an argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['poolsize']=$argv[$i]+0;
	} elseif ($argv[$i]=='-t' || $argv[$i]=='--timeout') {
		if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
			mexit('option «'.$argv[$i].'» requires a numeric argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['timeout']=$argv[$i]+0;
	} elseif ($argv[$i]=='-D' || $argv[$i]=='--deadline') {
		// parsetime() is expected to return false for a malformed time specification
		if ($i+1>=$argc || parsetime($argv[$i+1])===false)
			mexit('option «'.$argv[$i].'» requires a time specification as an argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['deadline']=parsetime($argv[$i]);
	} elseif ($argv[$i]=='-o' || $argv[$i]=='--oldline') {
		if ($i+1>=$argc || parsetime($argv[$i+1])===false)
			mexit('option «'.$argv[$i].'» requires a time specification as an argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['oldline']=parsetime($argv[$i]);
	} elseif ($argv[$i]=='-l' || $argv[$i]=='--ldtoots') {
		if ($i+1>=$argc || preg_match('/^\d+$/',$argv[$i+1])!==1 || $argv[$i+1]+0>40 || $argv[$i+1]+0<10)
			mexit('option «'.$argv[$i].'» requires a number >= 10 and <= 40 as an argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['ldtoots']=$argv[$i]+0;
	} elseif ($argv[$i]=='-f' || $argv[$i]=='--fetchusers') {
		$opts['fetchusers']=true;
	} elseif ($argv[$i]=='-N' || $argv[$i]=='--dontsetnew') {
		$opts['setnew']=false;
	} elseif ($argv[$i]=='-d' || $argv[$i]=='--dryrun') {
		$opts['dryrun']=true;
	} elseif ($argv[$i]=='-R' || $argv[$i]=='--dontrestore') {
		$opts['dontrestore']=true;
	} elseif ($argv[$i]=='-I' || $argv[$i]=='--ignorelock') {
		$opts['ignorelock']=true;
	} elseif ($argv[$i]=='-m' || $argv[$i]=='--moreclauses') {
		if ($i+1>=$argc)
			mexit('option «'.$argv[$i].'» requires some SQL clause as argument (use «-h» to read help).'.N,1);
		$i++;
		$opts['moreclauses']=$argv[$i];
	} elseif ($argv[$i]=='-h' || $argv[$i]=='--help') {
		echo($help);
		exit(0);
	} else {
		mexit('dont know how to interpret «'.$argv[$i].'» (you can read the help text using «-h» or «--help»).'.N,1);
	}
}
// Working directory holding the lock file and the job files: create it when it
// is missing, otherwise require it to be a readable and writable directory.
$rundirpath=__DIR__.'/run';
if (!file_exists($rundirpath)) {
	if (@mkdir($rundirpath)===false)
		mexit('could not create directory «'.$rundirpath.'».'.N,1);
} elseif (!is_dir($rundirpath)) {
	mexit('«'.$rundirpath.'» is not a directory.'.N,1);
} elseif (!(is_readable($rundirpath) && is_writable($rundirpath))) {
	mexit('«'.$rundirpath.'» is not readable and writeable.'.N,1);
}
// Lock file guarding against concurrent runs. The two failures below exit
// directly instead of going through mexit(), which would delete the lock file.
$lockfp=$rundirpath.'/'.FNAME.'lock';
if (!$opts['ignorelock'] && file_exists($lockfp)) {
	eecho(3,'lock file «'.$lockfp.'» exists (if you are sure '.SNAME.' is not already running you can use option «-I» to force execution).'.N);
	exit(1);
}
if (touch($lockfp)===false) {
	eecho(3,'could not touch file «'.$lockfp.'».'.N);
	exit(1);
}
// Read the database credentials from the project configuration file.
$inifp=__DIR__.'/../../conf/mustard.ini';
$iniarr=@parse_ini_file($inifp);
if ($iniarr===false) mexit('could not open config file «'.$inifp.'»'.N,1);
// Connect to MySQL. Depending on the mysqli error reporting mode, a failure is
// signalled either by an exception or by a false return value: handle both.
try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
catch (Exception $error) { $link=false; }
if ($link===false) {
	// mexit() tries to close $link whenever it is set, so drop the failed handle first
	unset($link);
	mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1);
}
try { $charsetok=mysqli_set_charset($link,'utf8mb4'); }
catch (Exception $error) { $charsetok=false; }
if (!$charsetok) mexit('could not set «utf8mb4» charset for MySQL: '.mysqli_error($link).'.'.N,1);
// Base command line shared by every child process; the hostname is appended when
// a child is started. The script path is quoted because the command is run
// through a shell and the directory name may contain spaces or other specials.
$cmd=escapeshellarg(__DIR__.'/'.CHILD).' -t '.$opts['timeout'].' -D '.$opts['deadline'].'s -o '.$opts['oldline'].'s -l '.$opts['ldtoots'];
if (!$opts['setnew']) $cmd.=' -N';
if ($opts['dryrun']) $cmd.=' -d';
if ($opts['fetchusers']) $cmd.=' -f';
eecho(1,'base command: «'.$cmd.'».'.N);
// Job files that make an interrupted session recoverable: instances.job holds
// the shuffled list of hostnames, status.job the progress made through it.
$instsjfp=$rundirpath.'/instances.job';
$statusjfp=$rundirpath.'/status.job';
$tini=microtime(true);
// proc_open() runs a string command through «sh -c». The bash-only «&>» operator
// is read by a plain POSIX sh as «run in background» followed by a separate
// redirection, so the shell would return at once and the child would look
// finished immediately; the portable spelling below avoids that.
$quiet=' >/dev/null 2>&1';
$procs=[];
$pipes=[];
if (!$opts['dontrestore'] && file_exists($statusjfp) && file_exists($instsjfp)) {
	// ---- recover the previous, unfinished session ----
	eecho(0,'looks like previous session was interrupted, trying to recover it...'.N);
	$insts=@file($instsjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
	if ($insts===false) mexit('could not open file «'.$instsjfp.'» for reading.'.N,1);
	$cinsts=count($insts);
	eecho(1,'loaded '.$cinsts.' hostnames from previous session file.'.N);
	$buf=@file($statusjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
	if ($buf===false) mexit('could not open file «'.$statusjfp.'» for reading.'.N,1);
	if (count($buf)<2) mexit('file «'.$statusjfp.'»: wrong format (1).'.N,1);
	// first line: pool size, index of the last started instance, elapsed seconds, finished count
	$buf[0]=explode("\t",$buf[0]);
	if (count($buf[0])!=4 ||
		preg_match('/^\d+$/',$buf[0][0])!==1 ||
		preg_match('/^\d+$/',$buf[0][1])!==1 ||
		preg_match('/^\d+(\.\d+)?$/',$buf[0][2])!==1 ||
		preg_match('/^\d+$/',$buf[0][3])!==1)
		mexit('file «'.$statusjfp.'»: wrong format (2).'.N,1);
	$opts['poolsize']=$buf[0][0]+0;
	$instk=$buf[0][1]+0;
	$toff=$buf[0][2]+0;
	$done=$buf[0][3]+0;
	// following lines: index of each instance that was being checked; restart them all
	for ($i=1; $i<count($buf); $i++) {
		if (preg_match('/^\d+$/',$buf[$i])!==1) mexit('file «'.$statusjfp.'»: wrong format (3).'.N,1);
		$slotk=$buf[$i]+0;
		// refuse an index that does not exist in the instances file
		if (!isset($insts[$slotk])) mexit('file «'.$statusjfp.'»: instance index '.$slotk.' is out of range.'.N,1);
		$host=$insts[$slotk];
		eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
		$procs[]=['proc'=>proc_open($cmd.' '.escapeshellarg($host).$quiet,[],$pipes[]), 'instk'=>$slotk, 'host'=>$host, 'begts'=>microtime(true)];
	}
	eecho(1,'recovered previous session.'.N);
} else {
	// ---- start a new session ----
	$insts=[];
	// hostname => true; gives constant-time duplicate checks instead of scanning $insts
	$known=[];
	eecho(0,'loading known, alive instances from the database...'.N);
	// the separating space keeps the user supplied clauses from being glued to «Dead=0»
	$res=myq($link,'SELECT URI FROM Instances WHERE Dead=0 '.$opts['moreclauses'],__LINE__);
	while ($row=mysqli_fetch_assoc($res)) {
		if (!isset($known[$row['URI']])) {
			$known[$row['URI']]=true;
			$insts[]=$row['URI'];
		}
	}
	eecho(1,'loaded '.count($insts).' known, alive instances from the database.'.N);
	if (!is_null($opts['peersfp'])) {
		eecho(0,'loading dead instances from the database...'.N);
		$res=myq($link,'SELECT URI FROM Instances WHERE Dead=1',__LINE__);
		$deadinsts=[];
		while ($row=mysqli_fetch_assoc($res))
			$deadinsts[$row['URI']]=true;
		eecho(1,'loaded '.count($deadinsts).' dead instances from the database.'.N);
		eecho(0,'loading instances from «'.$opts['peersfp'].'»...'.N);
		$peers=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
		if ($peers===false) mexit('could not open «'.$opts['peersfp'].'» for reading.'.N,1);
		$i=0;
		foreach ($peers as $pdom) {
			if (isset($known[$pdom])) continue;
			if (isset($deadinsts[$pdom])) {
				eecho(1,'ignoring instance «'.$pdom.'» from peers file because it is dead.'.N);
				continue;
			}
			// NOTE(review): willtrunc() is not defined in this file; presumably it comes from one of the included files — confirm.
			if (willtrunc($pdom,'Instances','URI')) {
				eecho(2,'ignoring instance «'.$pdom.'» from peers file because its hostname is too long for column «URI» of table «Instances».'.N);
				continue;
			}
			$known[$pdom]=true;
			$insts[]=$pdom;
			$i++;
		}
		eecho(1,'loaded '.$i.' instances from «'.$opts['peersfp'].'».'.N);
	}
	unset($deadinsts,$known);
	shuffle($insts);
	$cinsts=count($insts);
	eecho(1,$cinsts.' instances to be checked.'.N);
	// save the shuffled list: the indexes stored in status.job refer to this order
	$instsf=@fopen($instsjfp,'w');
	if ($instsf===false) mexit('could not open «'.$instsjfp.'» for writing.'.N,1);
	foreach ($insts as $host) fwrite($instsf,$host.N);
	fclose($instsf);
	$toff=0;
	$done=0;
	for ($instk=0; $instk<$opts['poolsize'] && $instk<$cinsts; $instk++) {
		$host=$insts[$instk];
		eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
		$procs[]=['proc'=>proc_open($cmd.' '.escapeshellarg($host).$quiet,[],$pipes[]), 'instk'=>$instk, 'host'=>$host, 'begts'=>microtime(true)];
	}
	// from here on $instk is the index of the last instance that was started
	$instk--;
}
// The coordinator does not use the database connection past this point.
mysqli_close($link);
unset($link);
// Supervision loop: once per second look at every pool slot, refill the slots
// whose process has ended with the next host, and stop when nothing is left
// running and no host is left to start.
$rundone=false;
do {
	$now=microtime(true);
	// elapsed time, including the time spent by a recovered previous session
	$eta=$now-$tini+$toff;
	eecho(1,'[[[ CHECKING PROCESSES POOL ]]]'.N);
	$somerun=false;
	foreach ($procs as $key=>$proc) {
		// a null slot has been retired: there was no host left to give it
		if (is_null($proc)) continue;
		$pstat=proc_get_status($proc['proc']);
		if ($pstat['running']) {
			eecho(1,'proc slot '.$key.': been running on «'.$proc['host'].'» for '.ght($now-$proc['begts']).'.'.N);
			$somerun=true;
			continue;
		}
		$done++;
		$out='proc slot '.$key.': finished running on «'.$proc['host'].'» (exit code: '.$pstat['exitcode'].')';
		if ($instk<$cinsts-1) {
			$instk++;
			$host=$insts[$instk];
			// portable redirection: a POSIX sh reads «&>» as «run in background», which
			// would make every child look finished right after it was started
			$procs[$key]=['proc'=>proc_open($cmd.' '.escapeshellarg($host).' >/dev/null 2>&1',[],$pipes[$key]), 'instk'=>$instk, 'host'=>$host, 'begts'=>$now];
			$out.=', started a new process on «'.$host.'».'.N;
			// the slot is busy again: without this the loop would end, and the job files
			// would be deleted, whenever all the live slots are refilled in the same pass
			$somerun=true;
		} else {
			$out.='; no more hosts to check.'.N;
			$procs[$key]=null;
		}
		eecho(1,$out);
	}
	// with no instance at all there is nothing to do, hence 100%; this also avoids a division by zero
	$perc=($cinsts>0) ? round(100/$cinsts*$done) : 100;
	$out=$done.'/'.$cinsts.' ('.$perc.'%); elapsed time: '.ght($eta);
	if ($done>0) $out.='; estimated time remaining: '.ght($cinsts*$eta/$done-$eta);
	eecho(1,$out.'.'.N);
	if ($somerun) {
		writestatus($statusjfp,$opts,$instk,$eta,$done,$procs);
		sleep(1);
	} else {
		$rundone=true;
	}
} while (!$rundone);
// Session completed: remove the job files and the lock file. The status file
// only exists when at least one pass found something running.
if (is_file($instsjfp)) unlink($instsjfp);
if (is_file($statusjfp)) unlink($statusjfp);
if (is_file($lockfp)) unlink($lockfp);
eecho(1,'Done :-)'.N);
exit(0);
// functions
/**
 * Saves the progress of the session so that it can be recovered after an
 * interruption.
 *
 * The first line holds, separated by tabs, the pool size, the index of the last
 * started instance, the elapsed time and the number of finished instances;
 * each following line holds the instance index of a slot that is still in use.
 *
 * The content is written to a temporary file which is then renamed over the
 * status file, so that an interruption during the write cannot leave a
 * truncated status file behind. On failure the script is terminated through
 * mexit() with exit code 2.
 *
 * @param string $statusjfp path of the status file
 * @param array  $opts      script options; only «poolsize» is read
 * @param int    $instk     index of the last instance for which a process was started
 * @param float  $eta       elapsed time in seconds
 * @param int    $done      number of instances already processed
 * @param array  $procs     pool slots; null entries are skipped
 */
function writestatus(&$statusjfp,&$opts,&$instk,&$eta,&$done,&$procs) {
	$buf=$opts['poolsize']."\t".$instk."\t".$eta."\t".$done.N;
	foreach ($procs as $proc)
		if (!is_null($proc))
			$buf.=$proc['instk'].N;
	$tmpfp=$statusjfp.'.tmp';
	if (@file_put_contents($tmpfp,$buf)===false)
		mexit('could not open «'.$tmpfp.'» for writing.'.N,2);
	// within one directory rename() replaces the old status file in a single step
	if (@rename($tmpfp,$statusjfp)===false)
		mexit('could not rename «'.$tmpfp.'» to «'.$statusjfp.'».'.N,2);
}
/**
 * Prints a message prefixed with the current date, time (with the fractional
 * part of the second) and a severity label.
 *
 * @param int    $lev severity: 0 Debug, 1 Info, 2 Warning, 3 Error; levels
 *                    below 2 go to standard output, the others to standard error
 * @param string $msg text to print; no newline is added
 */
function eecho($lev,$msg) {
	static $labels=['Debug', 'Info', 'Warning', 'Error'];
	// microtime(false) yields «<fraction> <seconds>», the fraction starting with «0.»
	list($frac,$secs)=explode(' ',microtime(false));
	$line=date('Y-m-d H:i:s',$secs).'.'.substr($frac,2).' '.$labels[$lev].': '.$msg;
	if ($lev>=2) {
		fwrite(STDERR,$line);
		return;
	}
	echo($line);
}
/**
 * Releases the global resources of the script, prints a final message and
 * terminates.
 *
 * The database link and the file handle are closed only when they actually
 * hold a connection or an open stream, so a value such as the false left by a
 * failed connect or open cannot raise a new error during shutdown. The lock
 * file is removed when its path is set and the file exists.
 *
 * @param string $msg  message to print: as «Error» when $code is not 0, as «Info» otherwise
 * @param int    $code exit status of the script
 */
function mexit($msg,$code) {
	global $link, $jsonf, $lockfp;
	if (isset($link) && $link instanceof mysqli) mysqli_close($link);
	if (isset($jsonf) && is_resource($jsonf)) fclose($jsonf);
	if (isset($lockfp) && is_file($lockfp)) unlink($lockfp);
	if ($code!=0)
		eecho(3,$msg);
	else
		eecho(1,$msg);
	exit($code);
}
/**
 * Runs a query and terminates the script when it fails.
 *
 * Depending on the mysqli error reporting mode a failed query either throws an
 * exception or makes mysqli_query() return false; both cases end the script
 * through mexit() with exit code 3, so that callers always get a usable result.
 *
 * @param mysqli $link  open database connection
 * @param string $query SQL statement to execute
 * @param int    $line  source line of the call, reported in the error message
 * @return mysqli_result|bool whatever mysqli_query() returned on success
 */
function myq(&$link,$query,$line) {
	try {
		$res=mysqli_query($link,$query);
	}
	catch (Exception $error) {
		mexit('query «'.$query.'» (line '.$line.') failed: '.$error->getMessage().N,3);
	}
	if ($res===false)
		mexit('query «'.$query.'» (line '.$line.') failed: '.mysqli_error($link).N,3);
	return($res);
}
?>