crawler.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. #!/usr/bin/php
  2. <?php
  3. /*
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>.
  14. */
  // Script-wide constants: newline shorthand, this script's file name with and
  // without extension, the child script to spawn and the relative lib directory.
  define('N',"\n");
  define('SNAME',basename(__FILE__));
  define('FNAME',preg_replace('/\.[^.]*$/','',SNAME));
  define('CHILD','getinstinfo.php');
  define('LIBDP','/../lib');

  // Project helpers, loaded from the lib directory next to this script's directory.
  require __DIR__.LIBDP.'/ght.php';
  require __DIR__.LIBDP.'/grace.php';
  require __DIR__.LIBDP.'/parsetime.php';

  use function mysqli_real_escape_string as myesc;

  // Ticks are needed so that the signal handlers installed below get dispatched.
  declare(ticks=1);

  if (function_exists('pcntl_signal')) {
      /**
       * Shuts the script down (through mexit(), removing the lock file) when
       * one of the signals registered below is received.
       *
       * @param int $signal number of the received signal
       */
      function signalHandler($signal) {
          mexit('received signal «'.$signal.'», shutting down.'.N,0,true);
      }

      // SIGTERM: termination ('kill' was called); SIGHUP: terminal log-out;
      // SIGINT: interrupted (Ctrl-C was pressed).
      foreach ([SIGTERM, SIGHUP, SIGINT] as $signo)
          pcntl_signal($signo,'signalHandler');
      unset($signo);
  }
  // Message importance levels, from least to most important. The position of
  // each name is the numeric level used by eecho() and by $opts['minmsgimplev'];
  // «None» is only meaningful as a threshold (it silences every message).
  $msglevs=['Debug', 'Info', 'Warning', 'Error', 'None'];

  // Default values of the command line options (see $help below).
  $opts=[
      // NOTE(review): $gracetime is not defined in this file; presumably it
      // comes from one of the required lib files — confirm.
      'gracetime'=>$gracetime,
      'poolsize'=>10,
      'peersfp'=>null,
      'dontrestore'=>false,
      'ignorelock'=>false,
      'minmsgimplev'=>1
  ];

  // Help text shown by «-h»/«--help». The continuation lines of the string are
  // deliberately kept at column 0 because they are printed verbatim.
  // The «-R» paragraph names the job files exactly as the script creates them
  // (FNAME.'_instances.job' and FNAME.'_status.job').
  $help='SYNOPSIS
'.SNAME.' [options]
DESCRIPTION
This script coordinates the parallel execution of a definable number of
'.CHILD.' processes “against” all the alive instances which are already
present in mastostart’s database, plus optionally those listed in a
specifiable file (typically the output file from a peerscrawl.php run).
OPTIONS
-
Everything after a single dash will be passed to '.CHILD.' processes as is.
-g, --gracetime <time>
If an instance has not been responding for longer than this time, avoid
checking it. See section «TIME SPECIFICATION» below to see how to specify
time.
DEFAULT: '.ght($opts['gracetime'],null,0).'
-G, --graceline
Return the “graceline” (0:0:0 of today minus gracetime: see option above) in
unix time and local time, then exit.
-p, --peersfp <file>
Defines the path to a file containing a list of instances to consider in
addition to those which are already present in the database. Note that this
option is ignored if the script will restore a previous unfinished session.
-P, --poolsize <number>
The number of slots in the processes pool, that is the number of '.CHILD.'
processes the script will run in parallel. Note that this option is ignored
if the script will restore a previous unfinished session.
DEFAULT: '.$opts['poolsize'].'
-I, --ignorelock
Normally, if its lockfile exists, the script will exit with an error.
If this option is set, instead, the lockfile existence will be ignored.
Please check that the script is actually not running before using it.
-R, --dontrestore
If this option is set and «'.FNAME.'_instances.job» and «'.FNAME.'_status.job» files from
a previous unfinished session are present in the «run» subdirectory inside
the directory where the script resides, the script will ignore them and
start a new session; otherwise the script will restore the previous,
unfinished session.
-m, --minmsgimplev <«debug»|«info»|«warning»|«error»|«none»>
Defines the minimum “importance level” of messages to be written to the
text user interface. There are 4 “importance levels”, in this order of
importance: «debug», «info», «warning», «error».
Setting this option to any of these values will write to the text user
interface all the messages with the specified or a greater level; setting
it to the special value «none» will completely disable messages.
DEFAULT: '.lcfirst($msglevs[$opts['minmsgimplev']]).'
-h, --help
When this option is specified, the script will show this help text and exit.
TIME SPECIFICATION
An example is better than ~5148 words :-)
To specify 1 year, 6 months (made of 31 days), 2 weeks, 3 days, 5 hours,
7 minutes and 12 seconds you can use «1y,6M,2w,3d,5h,7m,12s»; but you can
also use «12s,7m,5h,3d,2w,6M,1y», or even «18M,1w,1w,2d,1d,3h,2h,7m,12s».
LICENSE
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under certain
conditions; see <http://www.gnu.org/licenses/> for details.'.N;
  // Parse the command line into $opts and $childopts.
  // $childopts collects everything found after a lone «-»; each argument is
  // shell-quoted so that it reaches the child processes exactly as typed
  // (cmd() appends $childopts to a shell command line).
  $childopts='';
  for ($i=1; $i<$argc; $i++) {
      $arg=$argv[$i];
      $hasnext=($i+1<$argc);
      if ($arg==='-') {
          if ($hasnext) {
              // consume all the remaining arguments; this also ends the outer loop
              for ($i++; $i<$argc; $i++)
                  $childopts.=' '.escapeshellarg($argv[$i]);
          } else {
              eecho(2,'you have specified «-» as last argument...'.N);
          }
      } elseif ($arg==='-g' || $arg==='--gracetime') {
          if (!$hasnext || ($time=parsetime($argv[$i+1]))===false)
              mexit('option «'.$arg.'» requires a valid time specification as an argument (use «-h» to read help).'.N,1,false);
          $i++;
          // NOTE(review): nothing in this file recomputes $graceline from this
          // value; $graceline is defined elsewhere (presumably grace.php), so
          // this option may not affect the queries below — confirm.
          $opts['gracetime']=$time;
      } elseif ($arg==='-G' || $arg==='--graceline') {
          echo 'Graceline: '.$graceline.' ('.date('Y-m-d H:i:s',$graceline).').'.N;
          exit(0);
      } elseif ($arg==='-p' || $arg==='--peersfp') {
          if (!$hasnext || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
              mexit('option «'.$arg.'» requires an existing and readable file as an argument (use «-h» to read help).'.N,1,false);
          $i++;
          $opts['peersfp']=$argv[$i];
      } elseif ($arg==='-P' || $arg==='--poolsize') {
          // the pattern is anchored: values such as «12abc» must be rejected
          if (!$hasnext || preg_match('/^\d+$/',$argv[$i+1])!==1 || (int)$argv[$i+1]<1)
              mexit('option «'.$arg.'» requires an integer number greater than 0 as an argument (use «-h» to read help).'.N,1,false);
          $i++;
          $opts['poolsize']=(int)$argv[$i];
      } elseif ($arg==='-R' || $arg==='--dontrestore') {
          $opts['dontrestore']=true;
      } elseif ($arg==='-I' || $arg==='--ignorelock') {
          $opts['ignorelock']=true;
      } elseif ($arg==='-m' || $arg==='--minmsgimplev') {
          // the level is stored as its index in $msglevs
          $lev=($hasnext) ? array_search(ucfirst(strtolower($argv[$i+1])),$msglevs,true) : false;
          if ($lev===false)
              mexit('option «'.$arg.'» requires a “message importance level” value as an argument (use «-h» to read help).'.N,1,false);
          $i++;
          // this is the key read by eecho(); it used to be misspelled here,
          // which made the option a no-op
          $opts['minmsgimplev']=$lev;
      } elseif ($arg==='-h' || $arg==='--help') {
          echo($help);
          exit(0);
      } else {
          mexit('don’t know how to interpret «'.$arg.'» (you can read the help text using «-h» or «--help»).'.N,1,false);
      }
  }
  // Prepare the «run» directory, take the lock and decide whether a previous
  // session has to be restored.
  $rundirpath=__DIR__.'/run';
  $lockfp=$rundirpath.'/'.FNAME.'.lock';

  // The run directory is verified (and created, if missing) BEFORE touching
  // the lock file, because the lock file lives inside it: doing it the other
  // way round makes the very first run fail with «could not touch file».
  // No lock has been taken yet at this point, hence mexit(...,false).
  if (!file_exists($rundirpath)) {
      if (@mkdir($rundirpath)===false)
          mexit('could not create directory «'.$rundirpath.'».'.N,1,false);
  } elseif (!is_dir($rundirpath)) {
      mexit('«'.$rundirpath.'» is not a directory.'.N,1,false);
  } elseif (!is_readable($rundirpath) || !is_writeable($rundirpath)) {
      mexit('«'.$rundirpath.'» is not readable and writeable.'.N,1,false);
  }

  // Refuse to run if a lock file exists, unless «-I» was given.
  if (file_exists($lockfp) && !$opts['ignorelock']) {
      eecho(3,'lock file «'.$lockfp.'» exists (if you are sure '.SNAME.' is not already running you can use option «-I» to force execution).'.N);
      exit(1);
  }
  if (@touch($lockfp)===false) {
      eecho(3,'could not touch file «'.$lockfp.'».'.N);
      exit(1);
  }

  // Job files of a session: the list of hosts and the progress status.
  $instsjfp=$rundirpath.'/'.FNAME.'_instances.job';
  $statusjfp=$rundirpath.'/'.FNAME.'_status.job';

  // Restore only when both job files exist and «-R» was not given.
  $restore=(!$opts['dontrestore'] && file_exists($instsjfp) && file_exists($statusjfp));
  eecho(1,($restore ? '--- restarting ---' : '--- starting ---').N);
  // Build the list of hosts ($insts, $cinsts), the progress counters ($instk:
  // index of the last dispatched host, $toff: elapsed time carried over from a
  // previous session, $done: finished hosts) and the initial processes pool
  // ($procs, $pipes), either from the job files of an interrupted session or
  // from scratch (database, plus the optional peers file).
  if ($restore) {
      eecho(0,'looks like previous session was interrupted, trying to restore it...'.N);
      $insts=@file($instsjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
      if ($insts===false) mexit('could not open file «'.$instsjfp.'» for reading.'.N,1,true);
      $cinsts=count($insts);
      eecho(1,'loaded '.$cinsts.' hostnames from previous session file.'.N);
      $buf=@file($statusjfp,FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
      if ($buf===false) mexit('could not open file «'.$statusjfp.'» for reading.'.N,1,true);
      if (count($buf)<2) mexit('file «'.$statusjfp.'»: wrong format (1).'.N,1,true);
      // first line, as written by writestatus():
      // poolsize <TAB> instk <TAB> elapsed time <TAB> done
      $head=explode("\t",$buf[0]);
      if (count($head)!=4 ||
          preg_match('/^\d+$/',$head[0])!==1 ||
          preg_match('/^\d+$/',$head[1])!==1 ||
          preg_match('/^\d+(\.\d+)?$/',$head[2])!==1 ||
          preg_match('/^\d+$/',$head[3])!==1)
          mexit('file «'.$statusjfp.'»: wrong format (2).'.N,1,true);
      $opts['poolsize']=(int)$head[0];
      $instk=(int)$head[1];
      $toff=(float)$head[2];
      $done=(int)$head[3];
      // each following line is the index (into $insts) of a host that was
      // being checked when the status was saved: check those hosts again
      $procs=[];
      $pipes=[];
      $cbuf=count($buf);
      for ($i=1; $i<$cbuf; $i++) {
          if (preg_match('/^\d+$/',$buf[$i])!==1) mexit('file «'.$statusjfp.'»: wrong format (3).'.N,1,true);
          $k=(int)$buf[$i];
          // guard against a status file that does not match the instances file
          if (!isset($insts[$k])) mexit('file «'.$statusjfp.'»: wrong format (4).'.N,1,true);
          $host=$insts[$k];
          eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
          // child's stdin is a pipe, its stdout and stderr go to per-host log files
          $descspecs=[ 0=>['pipe','r'], 1=>['file',$rundirpath.'/'.$host.'.stdout.log','w'], 2=>['file',$rundirpath.'/'.$host.'.stderr.log','w'] ];
          $procs[]=['proc'=>proc_open(cmd($childopts,$host),$descspecs,$pipes[]), 'instk'=>$k, 'host'=>$host, 'begts'=>microtime(true)];
      }
      eecho(1,'restored previous session.'.N);
  } else {
      $inifp=__DIR__.'/../conf/mustard.ini';
      $iniarr=@parse_ini_file($inifp);
      if ($iniarr===false) mexit('could not open config file «'.$inifp.'»'.N,1,true);
      try { $link=@mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket']); }
      catch (Exception $error) { mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true); }
      // for php versions < 8
      if ($link===false) mexit('could not connect to MySQL server: '.mysqli_connect_error().'.'.N,1,true);
      try { $res=mysqli_set_charset($link,'utf8mb4'); }
      catch (Exception $error) { mexit('could not set «utf8mb4» charset for MySQL: '.mysqli_error($link).'.'.N,1,true); }
      // for php versions < 8
      if ($res===false) mexit('could not set MySQL charset: '.mysqli_errno($link).': '.mysqli_error($link).'.'.N,1,true);

      // $seen and $dead are used as sets (host => true) so that duplicates are
      // detected with a key lookup instead of a linear in_array() scan
      $insts=[];
      $seen=[];
      eecho(0,'loading instances from the database...'.N);
      // NOTE(review): $graceline is not defined in this file; presumably it
      // comes from one of the required lib files — confirm.
      $res=myq($link,'SELECT URI FROM Instances WHERE LastOkCheckTS IS NOT NULL AND LastOkCheckTS>='.$graceline,__LINE__);
      while ($row=mysqli_fetch_assoc($res)) {
          if (!isset($seen[$row['URI']])) {
              $seen[$row['URI']]=true;
              $insts[]=$row['URI'];
          }
      }
      eecho(1,'loaded '.count($insts).' instances which responded at least once since '.date('Y-m-d H:i:s',$graceline).' from the database.'.N);

      if (!is_null($opts['peersfp'])) {
          // hosts from the peers file are added only if they are neither
          // already listed nor known to be “dead”
          eecho(0,'loading “dead” instances from the database...'.N);
          $res=myq($link,'SELECT URI FROM Instances WHERE LastOkCheckTS IS NULL OR LastOkCheckTS<'.$graceline,__LINE__);
          $dead=[];
          while ($row=mysqli_fetch_assoc($res))
              $dead[$row['URI']]=true;
          eecho(0,'loaded '.count($dead).' “dead” instances from the database.'.N);
          eecho(0,'loading instances from «'.$opts['peersfp'].'»...'.N);
          $peers=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
          if ($peers===false) mexit('could not open «'.$opts['peersfp'].'» for reading.'.N,1,true);
          $added=0;
          foreach ($peers as $pdom) {
              if (isset($seen[$pdom]))
                  continue;
              if (isset($dead[$pdom])) {
                  eecho(0,'ignoring instance «'.$pdom.'» from peers file because it’s dead.'.N);
                  continue;
              }
              $seen[$pdom]=true;
              $insts[]=$pdom;
              $added++;
          }
          eecho(1,'loaded '.$added.' more instances from «'.$opts['peersfp'].'».'.N);
          unset($dead,$peers);
      }
      unset($seen);
      mysqli_close($link);
      // mexit() closes $link when it is set: unset it now that it is closed
      unset($link);

      shuffle($insts);
      $cinsts=count($insts);
      eecho(1,$cinsts.' instances to be checked.'.N);
      // save the (shuffled) list, so that an interrupted session can be restored
      $instsf=@fopen($instsjfp,'w');
      if ($instsf===false) mexit('could not open «'.$instsjfp.'» for writing.'.N,1,true);
      foreach ($insts as $host) fwrite($instsf,$host.N);
      fclose($instsf);

      $toff=0;
      $done=0;
      $procs=[];
      $pipes=[];
      // fill the pool with the first «poolsize» hosts (or less, if there are fewer)
      for ($instk=0; $instk<$opts['poolsize'] && $instk<$cinsts; $instk++) {
          $host=$insts[$instk];
          eecho(1,'bootstrapping processes pool, adding host «'.$host.'».'.N);
          // child's stdin is a pipe, its stdout and stderr go to per-host log files
          $descspecs=[ 0=>['pipe','r'], 1=>['file',$rundirpath.'/'.$host.'.stdout.log','w'], 2=>['file',$rundirpath.'/'.$host.'.stderr.log','w'] ];
          $procs[]=['proc'=>proc_open(cmd($childopts,$host),$descspecs,$pipes[]), 'instk'=>$instk, 'host'=>$host, 'begts'=>microtime(true)];
      }
      // the loop leaves $instk one past the last dispatched host
      $instk--;
  }
  // Main loop: once per second look at every slot of the processes pool; when
  // a child has finished, account for it and reuse its slot for the next host,
  // until every host has been dispatched and no child is running any more.
  $tini=microtime(true);
  $rundone=false;
  do {
      $now=microtime(true);
      // total elapsed time, including the time spent in a restored session
      $tet=$now-$tini+$toff;
      eecho(0,'[[[ CHECKING PROCESSES POOL ]]]'.N);
      $somerun=false;
      foreach ($procs as $key=>$proc) {
          // skip exhausted slots and slots whose process could not be started
          if (is_null($proc) || !is_resource($proc['proc']))
              continue;
          $pstat=proc_get_status($proc['proc']);
          if ($pstat['running']) {
              eecho(0,'proc slot '.$key.': been running on «'.$proc['host'].'» for '.ght($now-$proc['begts'],null,0).'.'.N);
              $somerun=true;
              continue;
          }
          fclose($pipes[$key][0]);
          // the return value is deliberately ignored: the exit code reported
          // below is the one taken from proc_get_status()
          proc_close($procs[$key]['proc']);
          $done++;
          $out='proc slot '.$key.': finished running on «'.$proc['host'].'» after '.ght($now-$proc['begts'],null,0).' (exit code: '.$pstat['exitcode'].')';
          if ($instk<$cinsts-1) {
              $instk++;
              $host=$insts[$instk];
              // child's stdin is a pipe, its stdout and stderr go to per-host log files
              $descspecs=[ 0=>['pipe','r'], 1=>['file',$rundirpath.'/'.$host.'.stdout.log','w'], 2=>['file',$rundirpath.'/'.$host.'.stderr.log','w'] ];
              $procs[$key]=['proc'=>proc_open(cmd($childopts,$host),$descspecs,$pipes[$key]), 'instk'=>$instk, 'host'=>$host, 'begts'=>$now];
              $out.='; started a new process on «'.$host.'».'.N;
              // a freshly started child counts as running: without this, when
              // all the slots finish during the same pass (always the case
              // with a pool of 1) the loop would end, and the job files would
              // be removed, while children are still at work
              $somerun=true;
          } else {
              $out.='; no more hosts to check.'.N;
              $procs[$key]=null;
          }
          eecho(1,$out);
      }
      // with no instances at all there is nothing to do: report 100% instead
      // of dividing by zero
      $pct=($cinsts>0) ? round(100/$cinsts*$done) : 100;
      $out=$done.'/'.$cinsts.' ('.$pct.'%); elapsed time: '.ght($tet,null,0);
      if ($done>0) $out.='; estimated time remaining: '.ght($cinsts*$tet/$done-$tet,null,0);
      eecho(1,$out.'.'.N);
      if ($somerun) {
          // save the progress, so that the session can be restored if interrupted
          writestatus($statusjfp,$opts,$instk,$tet,$done,$procs);
          sleep(1);
      } else {
          $rundone=true;
      }
  } while (!$rundone);

  // Session completed: remove the job files and the lock. The status file may
  // never have been written (writestatus() is called only while something is
  // running), hence the existence checks.
  foreach ([$instsjfp, $statusjfp, $lockfp] as $fp)
      if (is_file($fp)) unlink($fp);
  unset($fp);
  eecho(1,'done :-)'.N);
  exit(0);
  307. // functions
  /**
   * Saves the progress of the current session to the status job file.
   *
   * The file is rewritten from scratch at every call. Its first line holds
   * four TAB-separated values (pool size, index of the last dispatched
   * instance, elapsed time in seconds, number of finished instances); each
   * following line holds the instance index of a non-null pool slot.
   *
   * The elapsed time is written in fixed notation: PHP's default
   * float-to-string conversion switches to exponent form for very small
   * values (e.g. «1.0E-5»), which the restore code, expecting
   * /^\d+(\.\d+)?$/, would reject as a malformed file.
   *
   * Terminates the script through mexit() (exit code 2) when the file can not
   * be opened for writing.
   *
   * @param string $statusjfp path to the status job file
   * @param array  $opts      script options; only «poolsize» is read
   * @param int    $instk     index of the last dispatched instance
   * @param float  $tet       total elapsed time, in seconds
   * @param int    $done      number of finished instances
   * @param array  $procs     processes pool; null entries are skipped, the
   *                          others must have an «instk» key
   */
  function writestatus(&$statusjfp,&$opts,&$instk,&$tet,&$done,&$procs) {
      $f=@fopen($statusjfp,'w');
      if ($f===false) mexit('could not open «'.$statusjfp.'» for writing.'.N,2,true);
      // «%F» is the locale-independent fixed-point format (6 decimals)
      fwrite($f,$opts['poolsize']."\t".$instk."\t".sprintf('%F',$tet)."\t".$done.N);
      foreach ($procs as $proc)
          if (!is_null($proc))
              fwrite($f,$proc['instk'].N);
      fclose($f);
  }
  /**
   * Builds the shell command line used to run the child script on a host.
   *
   * The command is prefixed with «exec»; the path of the child script and the
   * host are shell-quoted (so that an installation path containing spaces or
   * other special characters does not break the command), while $childopts
   * is inserted verbatim.
   *
   * @param string $childopts options for the child script, either empty or
   *                          starting with a space
   * @param string $host      host to pass to the child script as last argument
   * @return string the command line
   */
  function cmd(&$childopts, &$host) {
      $script=escapeshellarg(__DIR__.'/'.CHILD);
      return 'exec '.$script.$childopts.' '.escapeshellarg($host);
  }
  /**
   * Writes a message, prefixed with a timestamp and the name of its
   * importance level, if its level is at least $opts['minmsgimplev'].
   *
   * Messages with a level lower than 2 are written to standard output, the
   * others to standard error.
   *
   * @param int    $lev importance level of the message (index into $msglevs)
   * @param string $msg text of the message; no newline is added
   */
  function eecho($lev,$msg) {
      global $opts, $msglevs;
      // microtime(false) returns «<fraction> <seconds>», the fraction being «0.xxxxxxxx»
      list($frac,$secs)=explode(' ',microtime(false));
      $stamp=date('Y-m-d H:i:s',$secs).'.'.substr($frac,2);
      $line=$stamp.' '.$msglevs[$lev].': '.$msg;
      if ($lev<$opts['minmsgimplev'])
          return;
      if ($lev>=2) {
          fwrite(STDERR,$line);
      } else {
          echo($line);
      }
  }
  /**
   * Runs a query, terminating the script through mexit() (exit code 3) if it
   * fails.
   *
   * @param mysqli $link  connection to run the query on
   * @param string $query the SQL query
   * @param int    $line  source line of the caller, reported on failure
   * @return mixed whatever mysqli_query() returned (never false)
   */
  function myq(&$link,$query,$line) {
      $failed='query «'.$query.'» (line '.$line.') failed: ';
      try {
          $result=mysqli_query($link,$query);
      } catch (Exception $e) {
          mexit($failed.$e->getMessage().N,3,true);
      }
      // php versions < 8 seem to report failures with a false return value
      // instead of an exception
      if ($result===false) {
          mexit($failed.mysqli_errno($link).': '.mysqli_error($link).'.'.N,3,true);
      }
      return $result;
  }
  /**
   * Terminates the script: closes the database connection if there is one,
   * optionally removes the lock file, writes a final message and exits.
   *
   * The message is written through eecho() with level 3 («Error») when $code
   * is not 0, with level 1 («Info») otherwise.
   *
   * @param string $msg     final message
   * @param int    $code    exit code of the script
   * @param bool   $remlock whether to remove the lock file (if it exists)
   */
  function mexit($msg,$code,$remlock) {
      global $link, $lockfp;
      if (isset($link) && $link!==false) {
          mysqli_close($link);
      }
      if ($remlock && isset($lockfp) && is_file($lockfp)) {
          unlink($lockfp);
      }
      $level=($code!=0) ? 3 : 1;
      eecho($level,$msg);
      exit($code);
  }
  354. ?>