2020-10-13 08:21:26 +02:00
|
|
|
|
#!/usr/bin/php
|
|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
*/
|
|
|
|
|
|
2022-12-01 05:44:06 +01:00
|
|
|
|
// Newline shorthand appended to every message this script prints.
const N = "\n";

// Project include; presumably the source of gurl(), which crawl() calls.
require(__DIR__.'/../site/mustard/include/gurl.php');

// Select the locale named by the LANG environment variable.
setlocale(LC_ALL, getenv('LANG'));

// Default settings; the command-line parser below may override most of them.
$opts = [
    // INI file holding the database credentials (read only with --excludedead).
    'inifp' => __DIR__.'/../conf/mustard.ini',
    // First instance to crawl.
    'startinst' => 'mastodon.social',
    // Output file for responding instances.
    'peersfp' => __DIR__.'/peers',
    // Output file for every checked instance.
    'allpeersfp' => __DIR__.'/peers.all',
    // Whether to preload the peers file before crawling.
    'restore' => false,
    // Optional file of exclusion regular expressions.
    'excludefp' => null,
    // Timeout, in seconds, passed to each fetch.
    'timeout' => 5,
    // Whether to log the less interesting skip notices.
    'verbose' => false,
    // Whether to skip instances flagged as dead in the database.
    'excludedead' => false,
    // Timezone identifier shown in the help text and settable with -T.
    'timezone' => date_default_timezone_get(),
    // Whether to run even when the lockfile exists.
    'ignorelock' => false,
];
|
|
|
|
|
|
|
|
|
|
// Help text printed by -h/--help; current defaults are interpolated into it.
$help = <<<HELPTEXT
peerscrawl.php
DESCRIPTION
This program tries to build a fairly complete list of fediverse instances
exposing the [instance]/api/v1/instance/peers endpoint.
SYNOPSIS
peerscrawl.php [options]
OPTIONS
-s, --startinst <domain>
Defines the first instance to crawl.
DEFAULT: «{$opts['startinst']}»
-p, --peersfp <file>
Defines the file into which the ordered list of responding instances
will be saved.
DEFAULT: «{$opts['peersfp']}»
-a, --allpeersfp <file>
Defines the file into which the ordered list of all checked instances
will be saved.
DEFAULT: «{$opts['allpeersfp']}»
-I, --ignorelock
Normally, if its lockfile exists, the program exits with an error before
doing anything. With this option the lockfile is ignored. Please verify
that the program is not already running before using it.
-r, --restore
If peers file already exists on program’s start it will be loaded into
memory and each instance it contains will be considered “already
crawled”, thus allowing to “restore an interrupted crawling session”.
-e, --excludefp <file>
Defines a file containing exclusion rules: one regular expression per
line (empty lines are ignored). Any instance matching any defined regex
will be ignored by the program. Changes made to this file during program
execution will be taken into account.
-t, --timeout <seconds>
Defines the timeout in seconds for every connection attempt.
DEFAULT: «{$opts['timeout']}»
-T, --timezone <timezone identifier>
Defines the timezone for displaying localized values for dates and times.
DEFAULT on this system: «{$opts['timezone']}»
Note: if you want localized format as well set LANG environment variable.
-L, --tzlist
List all valid timezones and exit.
-E, --excludedead
Exclude instances marked as "Dead" in the database.
-v, --verbose
Be more verbose.
-h, --help
Show this help text and exit.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.
HELPTEXT;
$help .= N;
|
|
|
|
|
|
|
|
|
|
// Command-line parsing. Each recognised option updates $opts; options that
// take a value consume the following argument. Any problem ends the program
// through mexit() with exit code 1.
for ($i = 1; $i < $argc; $i++) {
    // Arguments that do not start with a dash are skipped without complaint.
    if (substr($argv[$i], 0, 1) != '-') {
        continue;
    }
    // True when another argument follows the current option.
    $hasnext = ($i + 1 < $argc);
    switch ($argv[$i]) {
        case '-s':
        case '--startinst':
            if (!$hasnext) {
                mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N, 1);
            }
            $opts['startinst'] = $argv[++$i];
            break;
        case '-p':
        case '--peersfp':
            if (!$hasnext) {
                mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N, 1);
            }
            $opts['peersfp'] = $argv[++$i];
            break;
        case '-a':
        case '--allpeersfp':
            if (!$hasnext) {
                mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N, 1);
            }
            $opts['allpeersfp'] = $argv[++$i];
            break;
        case '-r':
        case '--restore':
            $opts['restore'] = true;
            break;
        case '-I':
        case '--ignorelock':
            $opts['ignorelock'] = true;
            break;
        case '-e':
        case '--excludefp':
            // The exclusion file must already exist and be readable.
            if (!$hasnext || !file_exists($argv[$i + 1]) || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1])) {
                mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable file’s path (use «-h» for more info).'.N, 1);
            }
            $opts['excludefp'] = $argv[++$i];
            break;
        case '-t':
        case '--timeout':
            // Only a plain run of digits is accepted.
            if (!$hasnext || preg_match('/^[0-9]+$/', $argv[$i + 1]) !== 1) {
                mexit('Option «'.$argv[$i].'» has to be followed by a number of seconds (use «-h» for more info).'.N, 1);
            }
            $opts['timeout'] = $argv[++$i] + 0;
            break;
        case '-T':
            // fall through
        case '--timezone':
            // Validation and activation happen in one step: the timezone is
            // set right here, and a failure means the identifier is invalid.
            if (!$hasnext || !@date_default_timezone_set($argv[$i + 1])) {
                mexit('Option «'.$argv[$i].'» has to be followed by a valid timezone identifier (use «-h» for more info).'.N, 1);
            }
            $opts['timezone'] = $argv[++$i];
            break;
        case '-L':
        case '--tzlist':
            foreach (timezone_identifiers_list() as $val) {
                gecho($val.N, false, false);
            }
            exit(0);
        case '-E':
        case '--excludedead':
            $opts['excludedead'] = true;
            break;
        case '-v':
        case '--verbose':
            $opts['verbose'] = true;
            break;
        case '-h':
        case '--help':
            mexit($help, 0);
        default:
            mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N, 1);
    }
}
|
|
|
|
|
|
2020-10-14 08:37:41 +02:00
|
|
|
|
// Lockfile handling: refuse to start when another run appears to be active,
// unless --ignorelock was given.
$lockfp = __DIR__.'/peerscrawl.lock';

// Mode 'x' creates the file only if it does not exist yet, so checking and
// creating happen in one atomic step. A separate is_file() + touch() pair
// would let two processes started together both pass the check.
$lockf = @fopen($lockfp, 'x');
if ($lockf !== false) {
    fclose($lockf);
} else {
    if (is_file($lockfp) && !$opts['ignorelock']) {
        echo('Lockfile exists: it seems the program is already running; if you’re sure it’s not true, use «-I» to force execution.'.N);
        exit(2);
    }
    // Either the lock is being ignored on purpose, or the file could not be
    // created for another reason; touch() keeps the previous behaviour
    // (refresh the file, or emit PHP's own warning) in both cases.
    touch($lockfp);
}
|
|
|
|
|
|
2020-10-13 08:21:26 +02:00
|
|
|
|
/**
 * Prints a final message, releases what the script holds and terminates.
 *
 * The database link (if any) is closed and the lockfile (if already created)
 * is removed before exiting. The message goes to standard output when $code
 * is 0 and to standard error otherwise.
 *
 * @param string $msg  Text to print, newline included.
 * @param int    $code Process exit status.
 */
function mexit($msg, $code) {
    global $link, $lockfp;

    if (isset($link) && $link !== false) {
        // Best-effort cleanup: the link may already have been closed by the
        // code that used it, and closing it twice raises an Error on PHP 8.
        // That must not prevent the lockfile removal and the exit below.
        try {
            @mysqli_close($link);
        } catch (\Throwable $e) {
            // Nothing left to close.
        }
    }

    if (isset($lockfp) && file_exists($lockfp)) {
        unlink($lockfp);
    }

    if ($code == 0) {
        echo($msg);
    } else {
        fwrite(STDERR, $msg);
    }
    exit($code);
}
|
|
|
|
|
|
2022-07-13 12:45:57 +02:00
|
|
|
|
/**
 * Writes a message, optionally timestamped, to stdout or stderr.
 *
 * @param string $msg    Text to write; no newline is added.
 * @param bool   $prtime When true, the text is prefixed with microdate() and a space.
 * @param bool   $iserr  When true, the text goes to STDERR instead of standard output.
 */
function gecho($msg, $prtime, $iserr) {
    $out = $prtime ? microdate().' '.$msg : $msg;
    if (!$iserr) {
        echo($out);
        return;
    }
    fwrite(STDERR, $out);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Formats a microtime() string as "Y-m-d H:i:s.<fraction>".
 *
 * @param string|null $time A value in the "0.12345600 1600000000" format
 *                          returned by microtime(false); the current time
 *                          is used when null.
 * @return string Date and time, followed by a dot and the fractional digits.
 */
function microdate($time = null) {
    // First element is the "0.xxxxxxxx" fraction, second the Unix timestamp.
    $parts = explode(' ', $time ?? microtime(false));
    // Skip the leading "0." of the fraction.
    $fraction = substr($parts[0], 2);
    return(date('Y-m-d H:i:s', $parts[1]).'.'.$fraction);
}
|
|
|
|
|
|
2020-10-13 08:21:26 +02:00
|
|
|
|
/**
 * De-duplicates and sorts a list in place, then writes it one entry per line.
 *
 * A warning is logged when duplicates had to be removed, and an error is
 * logged when the destination cannot be opened; neither stops the program.
 *
 * @param array  $arr     List to clean up; modified in place.
 * @param string $arrdesc Human-readable description used in log messages.
 * @param string $fp      Path of the file to (over)write.
 */
function sortcheckandsave(&$arr, $arrdesc, &$fp) {
    $before = count($arr);
    $arr = array_unique($arr);
    $dupes = $before - count($arr);
    if ($dupes != 0) {
        gecho('WARNING: '.$arrdesc.' contained '.$dupes.' duplicates, better check my code ;-)'.N, true, true);
    }

    gecho('Saving ordered '.$arrdesc.' into «'.$fp.'».'.N, true, false);
    sort($arr);

    $out = @fopen($fp, 'w');
    if ($out === false) {
        gecho('ERROR: couldn’t open «'.$fp.'» for writing.'.N, true, true);
        return;
    }
    foreach ($arr as $entry) {
        fwrite($out, $entry.N);
    }
    fclose($out);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Releases the output files and the lockfile, optionally saving sorted lists.
 *
 * @param bool $dosort When true, both instance lists are de-duplicated,
 *                     sorted and rewritten to their files.
 */
function shutdown($dosort) {
    global $opts, $peersf, $allpeersf, $insts, $ainsts, $lockfp;

    // Close whichever output handles were actually opened.
    foreach ([$peersf, $allpeersf] as $handle) {
        if ($handle !== null && $handle !== false) {
            @fclose($handle);
        }
    }

    if (isset($lockfp) && file_exists($lockfp)) {
        unlink($lockfp);
    }

    if (!$dosort) {
        return;
    }
    sortcheckandsave($insts, 'list of responding instances', $opts['peersfp']);
    sortcheckandsave($ainsts, 'list of all checked instances', $opts['allpeersfp']);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Handles a termination signal: reports it, cleans up and exits with code 3.
 *
 * WARNING: if the script is piped, as in "script.php | tee script.log",
 * this function still runs even though its output is not shown.
 *
 * @param int $signal Number of the signal that was received.
 */
function signalHandler($signal) {
    $notice = N.'I got interrupted (signal: '.$signal.').'.N;
    echo($notice);
    // Close files and drop the lock, without rewriting the sorted lists.
    shutdown(false);
    exit(3);
}
|
|
|
|
|
// Deliver signals asynchronously (used here in place of declare(ticks=1)).
pcntl_async_signals(true);

// Route every signal that should stop the crawl to the same handler:
//   SIGTERM - termination ('kill' was called)
//   SIGHUP  - terminal log-out
//   SIGINT  - interrupted (Ctrl-C is pressed)
foreach ([SIGTERM, SIGHUP, SIGINT] as $signo) {
    pcntl_signal($signo, 'signalHandler');
}
|
|
|
|
|
|
|
|
|
|
// URIs of the instances flagged as dead in the database; stays empty unless
// --excludedead was requested.
$deadinsts = array();

if ($opts['excludedead']) {
    // The error handling below relies on return values ("... or mexit()").
    // Since PHP 8.1 mysqli throws exceptions by default, which would bypass
    // mexit() and leave the lockfile behind, so select the non-throwing mode.
    mysqli_report(MYSQLI_REPORT_OFF);

    $iniarr = @parse_ini_file($opts['inifp'])
        or mexit('ERROR: I couldn’t open «'.$opts['inifp'].'».'.N, 2);

    $link = @mysqli_connect($iniarr['db_host'], $iniarr['db_admin_name'], $iniarr['db_admin_password'], $iniarr['db_name'], $iniarr['db_port'], $iniarr['db_socket'])
        or mexit('ERROR: I couldn’t connect to MySQL server: '.mysqli_connect_error().N, 2);

    mysqli_set_charset($link, 'utf8mb4')
        or mexit('ERROR trying to set MySQL client charset: '.__LINE__.': '.mysqli_error($link).N, 2);

    $res = mysqli_query($link, 'SELECT URI FROM Instances WHERE Dead=1')
        or mexit('ERROR: '.__LINE__.': '.mysqli_error($link).N, 2);

    // Read the whole result while the connection is still open.
    while ($row = mysqli_fetch_assoc($res)) {
        $deadinsts[] = $row['URI'];
    }
    mysqli_free_result($res);
    unset($res);

    // The database is not needed any more. Reset $link after closing so that
    // mexit() does not try to close the same connection a second time.
    mysqli_close($link);
    $link = null;

    gecho('Loaded list of dead instances ('.count($deadinsts).').'.N, true, false);
}
|
|
|
|
|
|
|
|
|
|
// Crawl state shared with the functions below through "global":
$insts = array();   // instances that answered the peers request
$ainsts = array();  // every instance checked so far
$exarr = array();   // exclusion regular expressions currently in force

// --restore: preload the responding instances saved by a previous run.
if ($opts['restore']) {
    if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        gecho('Loading «'.$opts['peersfp'].'».'.N, true, false);
        $insts = file($opts['peersfp'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        // file() can still fail after the checks above; treat it like an
        // unreadable file instead of continuing with a non-array.
        if ($insts === false) {
            mexit('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N, 2);
        }
    } else {
        mexit('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N, 2);
    }
}

// Both output files are rewritten from scratch on every run.
$peersf = @fopen($opts['peersfp'], 'w');
if (!$peersf) {
    mexit('I could not open «'.$opts['peersfp'].'» in write mode.'.N, 2);
}

// Opening in 'w' mode has just emptied the peers file. Write the restored
// entries straight back, otherwise an interruption of this run would lose
// everything the previous run had saved.
if (count($insts) > 0) {
    fwrite($peersf, implode(N, $insts).N);
}

$allpeersf = @fopen($opts['allpeersfp'], 'w');
if (!$allpeersf) {
    mexit('I could not open «'.$opts['allpeersfp'].'» in write mode.'.N, 2);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether a string is empty or made of whitespace only.
 *
 * @param string $val Text to examine.
 * @return bool True when nothing but whitespace (or nothing at all) is present.
 */
function isempty($val) {
    return(preg_match('/^\s*$/', $val) === 1);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Blocks until a TCP connection to a well-known host succeeds.
 *
 * Each attempt targets port 80 with a one-second timeout; after a failure a
 * warning is logged and the next attempt is made five seconds later.
 */
function waituntilonline() {
    $probehost = 'www.google.com';
    for (;;) {
        $sock = @fsockopen($probehost, 80, $errno, $errstr, 1);
        if ($sock !== false) {
            break;
        }
        gecho('WARNING: it seems we are offline :-('.N, true, true);
        sleep(5);
    }
    fclose($sock);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reloads the exclusion rules from the file given with --excludefp.
 *
 * Does nothing when no exclusion file was configured. When the file cannot
 * be opened a warning is logged and the rules already in memory are kept.
 * Otherwise $exarr is rebuilt from the file: blank lines are skipped and
 * lines that are not valid regular expressions are reported with their line
 * number and ignored.
 */
function updexarr() {
    global $exarr, $opts;

    if (is_null($opts['excludefp'])) {
        return;
    }

    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        gecho('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N, true, true);
        return;
    }

    $exarr = array();
    $i = 0;
    // Loop on the return value of fgets() rather than on feof(): if a read
    // fails, feof() may never become true and the loop would not end.
    while (($raw = fgets($f)) !== false) {
        $i++;
        $line = trim($raw);
        if (isempty($line)) {
            continue;
        }
        // A pattern is accepted when preg_match() can run it without failing.
        if (@preg_match($line, 'foo') !== false) {
            $exarr[] = $line;
        } else {
            gecho('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N, true, true);
        }
    }
    // This function runs once per round: release the handle explicitly.
    fclose($f);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether an instance name matches any exclusion rule in $exarr.
 *
 * @param string $inst Instance name to test.
 * @return bool True as soon as one regular expression matches, false otherwise.
 */
function ckexarr($inst) {
    global $exarr;

    $excluded = false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            $excluded = true;
            break;
        }
    }
    return($excluded);
}
|
|
|
|
|
|
2020-10-21 15:26:31 +02:00
|
|
|
|
require(__DIR__.'/../site/mustard/include/ghs.php');
|
|
|
|
|
require(__DIR__.'/../site/mustard/include/ght.php');
|
2020-10-13 08:21:26 +02:00
|
|
|
|
|
2022-11-11 21:57:30 +01:00
|
|
|
|
/**
 * Tells whether a string is longer in bytes than in matched characters.
 *
 * Characters are counted by letting the UTF-8 aware pattern /./u match once
 * per character; the byte length comes from strlen().
 *
 * @param string $s Text to examine.
 * @return bool True when the byte count exceeds the character count.
 */
function ismultibyte($s) {
    // Only the replacement count is of interest, not the resulting string.
    preg_replace('/./u', '.', $s, -1, $chars);
    return(strlen($s) > $chars);
}
|
|
|
|
|
|
2020-10-13 08:21:26 +02:00
|
|
|
|
/**
 * Tells whether a peer name is a syntactically valid hostname.
 *
 * Names containing non-ASCII characters are first converted to their ASCII
 * (IDN) form. The result must be at most 253 bytes long, and each
 * dot-separated label must be 1 to 63 bytes long, contain only letters,
 * digits and hyphens, and neither start nor end with a hyphen.
 *
 * The name is checked exactly as given: a path or a port is not stripped,
 * so a value such as "host/path" or "host:8080" is rejected.
 *
 * @param string $url Peer name to validate.
 * @return bool True when the name passes every check.
 */
function validhostname($url) {
    // The variable tested below must be assigned: with the stripping code
    // disabled it was left undefined, which made every name invalid.
    $hostname = $url;

    // Any byte outside the ASCII range calls for the IDN conversion.
    if (preg_match('/[^\x00-\x7F]/', $hostname) === 1) {
        $hostname = idn_to_ascii($hostname);
        // The conversion reports failure with a non-string value.
        if (!is_string($hostname)) {
            return(false);
        }
    }

    if (strlen($hostname) > 253) {
        return(false);
    }

    foreach (explode('.', $hostname) as $label) {
        $len = strlen($label);
        if ($len < 1 || $len > 63) {
            return(false);
        }
        if ($label[0] === '-' || $label[$len - 1] === '-') {
            return(false);
        }
        // The D modifier makes "$" match only at the very end, so a trailing
        // newline is not silently accepted.
        if (preg_match('#^[a-zA-Z0-9-]+$#D', $label) !== 1) {
            return(false);
        }
    }

    return(true);
}
|
|
|
|
|
//$url='www.team.starschlep.com/'; if (validhostname($url)) echo('OK: '.$url.N); else echo('KO: '.$url.N); die();
|
|
|
|
|
|
|
|
|
|
/**
 * Runs one crawl round over $list, then recurses on the newly found peers.
 *
 * Every instance in $list is first recorded in $ainsts (and appended to the
 * "all peers" file) if it is not there yet. Each instance is then asked for
 * its peers list; instances that answer with a JSON array are recorded in
 * $insts (and appended to the peers file). Each returned peer is queued for
 * the next round unless it is not a string, is not a valid hostname, matches
 * an exclusion regex, was already checked, is already queued, or is marked
 * dead while --excludedead is active.
 *
 * Membership tests use strict comparison, so two different names that both
 * look like numbers (for instance "100" and "1e2") are never taken for the
 * same instance.
 *
 * @param array $list Instance names to query in this round.
 * @param int   $id   Round number, used in log messages.
 */
function crawl($list, $id) {
    global $ainsts, $insts, $deadinsts, $peersf, $allpeersf, $opts, $tini;

    gecho('~~~~~~~ START OF ROUND '.$id.' ~~~~~~~'.N, true, false);
    waituntilonline();
    // Pick up any change made to the exclusion file since the last round.
    updexarr();

    // Record the whole round up front, so that peers pointing back at
    // instances of this very round are recognised as already known.
    foreach ($list as $inst) {
        if (!in_array($inst, $ainsts, true)) {
            $ainsts[] = $inst;
            fwrite($allpeersf, $inst.N);
        }
    }

    $nlist = array();   // instances to query in the next round
    $c = count($list);
    $i = 0;
    $rtini = time();    // start time of this round, for the ETR estimate

    foreach ($list as $inst) {
        $i++;
        $now = time();
        $rtela = $now - $rtini;
        gecho('>>> '.$inst.N, true, false);
        // TET: total elapsed time; ETR: estimated time remaining, obtained by
        // scaling the time spent so far to the size of the round.
        gecho('@@@ Round '.$id.', '.$i.'/'.$c.': TET: '.ght($now-$tini,null,0).'; ETR of this round: '.ght($rtela/$i*$c-$rtela,null,0).'; using '.ghs(memory_get_usage(true)).' mem. (peak: '.ghs(memory_get_peak_usage(true)).'); '.count($insts).' responding insts; '.count($nlist).' insts in next round list; '.count($ainsts).' total.'.N, true, false);
        gecho('Trying to load «'.$inst.'»’s peers...'.N, true, false);

        $resp = gurl('https://'.$inst.'/api/v1/instance/peers', $opts['timeout']);
        if ($resp['cont'] === false) {
            gecho('ERROR: '.$resp['emsg'].N, true, true);
            continue;
        }

        $peers = @json_decode($resp['cont'], true);
        if (!is_array($peers)) {
            gecho('ERROR: $peers is not an array (its type is '.gettype($peers).').'.N, true, true);
            continue;
        }

        gecho('LOADED!'.N, true, false);
        if (in_array($inst, $insts, true)) {
            gecho('NOTICE: «'.$inst.'» is not a new instance (it was already in $insts).'.N, true, false);
        } else {
            gecho('NEW INSTANCE FOUND: «'.$inst.'».'.N, true, false);
            $insts[] = $inst;
            fwrite($peersf, $inst.N);
        }

        foreach ($peers as $peer) {
            if (!is_string($peer)) {
                gecho(' ERROR: I won’t add this peer to next round list because its name is not a string.'.N, true, true);
            } elseif (!validhostname($peer)) {
                gecho(' ERROR: I won’t add «'.$peer.'» to next round list because it’s not a valid hostname.'.N, true, true);
            } elseif (ckexarr($peer)) {
                gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N, true, false);
            } elseif (in_array($peer, $ainsts, true)) {
                // Already checked, or part of the current round.
                if ($opts['verbose']) {
                    gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $ainsts.'.N, true, false);
                }
            } elseif (in_array($peer, $nlist, true)) {
                if ($opts['verbose']) {
                    gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $nlist.'.N, true, false);
                }
            } elseif ($opts['excludedead'] && in_array($peer, $deadinsts, true)) {
                gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s DEAD.'.N, true, false);
            } else {
                gecho(' ADDING PEER «'.$peer.'» to next round list.'.N, true, false);
                $nlist[] = $peer;
            }
        }
    }

    if (count($nlist) > 0) {
        // This round's list is no longer needed: drop it before recursing.
        unset($list);
        crawl($nlist, $id + 1);
    } else {
        gecho('Next round list is empty.'.N, true, false);
    }

    gecho('~~~~~~~ END OF ROUND '.$id.' ~~~~~~~'.N, true, false);
}
|
|
|
|
|
|
|
|
|
|
// Main run: crawl from the start instance, save the sorted lists, then
// print a short summary.
$tini = time();   // crawl start time; crawl() also reads it for its progress line

crawl([$opts['startinst']], 1);
gecho('DONE CRAWLING! :-)'.N, true, false);

// Close the files, drop the lock and rewrite both lists sorted.
shutdown(true);

$now = time();
$stampfmt = 'Y-m-d H:i:s';
gecho('Crawl started on '.date($stampfmt, $tini).' and ended on '.date($stampfmt, $now).'.'.N, true, false);
gecho(count($ainsts).' URIs checked in '.ght($now - $tini).'; '.count($insts).' responded. Max memory usage: '.ghs(memory_get_peak_usage(true)).N, true, false);

exit(0);
|
|
|
|
|
|
|
|
|
|
?>
|