2020-05-08 18:03:11 +02:00
|
|
|
|
#!/usr/bin/php
|
2020-05-02 19:59:53 +02:00
|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
// Shorthand for the line feed that terminates every message of the program.
define('N', "\n");

// Adopt, for every locale category, the locale named by the LANG environment variable.
setlocale(LC_ALL, getenv('LANG'));

// Built-in defaults of the command line options (see the help text for their meaning).
$opts = [
    'startinst' => 'mastodon.social', // first instance to crawl
    'peersfp'   => 'peers',           // file the instances list is saved into
    'restore'   => false,             // preload an already existing peers file?
    'excludefp' => null,              // optional file holding exclusion regexes
];
|
|
|
|
|
|
|
|
|
|
// Text printed by «-h» / «--help». The defaults shown are the values $opts
// holds at this point, i.e. before the command line has been parsed.
// NOTE(review): the text lines are reproduced as they are visible in this
// copy of the file; any leading indentation the original literal may have had
// is not recoverable from here — confirm against the repository.
$help = <<<HELP
peerscrawl.php
DESCRIPTION
This program tries to build a fairly complete list of mastodon instances.
SYNOPSIS
peerscrawl.php [options]
OPTIONS
-s, --startinst <domain>
Defines the first instance to crawl.
DEFAULT: «{$opts['startinst']}»
-p, --peersfp <file>
Defines the file into which the ordered list of instances will be saved.
DEFAULT: «{$opts['peersfp']}»
-r, --restore
If peers file already exists on program’s start it will be loaded into
memory and each instance it contains will be considered “already
crawled”, thus allowing to “restore an interrupted crawling session”.
-e, --excludefp <file>
Defines a file containing exclusion rules: one regular expression per
line (empty lines are ignored). Any instance matching any defined regex
will be ignored by the program. Changes made to this file during program
execution will be taken into account.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.
HELP;
$help .= N;
|
|
|
|
|
|
|
|
|
|
// Command line parsing.
// $argv is walked left to right. Arguments that do not start with «-» are
// skipped; every other argument must be a known option, otherwise the program
// exits with status 1. Options taking a value consume the following argument.
for ($i = 1; $i < $argc; $i++) {
    if (substr($argv[$i], 0, 1) != '-')
        continue;
    switch ($argv[$i]) {
        case '-s':
        case '--startinst':
            if ($i + 1 >= $argc)
                mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N, 1);
            $i++;
            $opts['startinst'] = $argv[$i];
            break;
        case '-p':
        case '--peersfp':
            if ($i + 1 >= $argc)
                mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N, 1);
            $i++;
            $opts['peersfp'] = $argv[$i];
            break;
        case '-r':
        case '--restore':
            // Plain flag: it takes no value, so the index must not be advanced
            // here, otherwise the argument following «-r» would be skipped.
            $opts['restore'] = true;
            break;
        case '-e':
        case '--excludefp':
            // The exclusion file has to exist and be readable right from the start.
            if ($i + 1 >= $argc || !file_exists($argv[$i + 1]) || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1]))
                mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable file’s path (use «-h» for more info).'.N, 1);
            $i++;
            $opts['excludefp'] = $argv[$i];
            break;
        case '-h':
        case '--help':
            mexit($help, 0);
            break;
        default:
            mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N, 1);
            break;
    }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Prints a message, then terminates the script.
 *
 * @param string $msg  Text to output, printed as is (no line feed is appended).
 * @param int    $code Value handed to exit().
 */
function mexit($msg, $code) {
    print $msg;
    exit($code);
}
|
|
|
|
|
|
2020-05-09 00:02:10 +02:00
|
|
|
|
/**
 * Closes the peers file and, on request, rewrites it as a sorted list.
 *
 * Works on the globals $opts (path stored under «peersfp»), $peersf (handle of
 * the peers file) and $insts (known instances, sorted in place when $dosort
 * is true).
 *
 * @param bool $dosort When true, the peers file is reopened for writing and
 *                     filled with the sorted content of $insts, one instance
 *                     per line; when false the handle is only closed.
 */
function shutdown($dosort) {
    global $opts, $peersf, $insts;

    // is_resource() is false for false/null AND for a handle that has already
    // been closed, so calling this function twice (e.g. a signal arriving
    // while the final save is running) never closes the same handle again.
    if (is_resource($peersf))
        fclose($peersf);

    if (!$dosort)
        return;

    echo('Saving ordered instances list into «'.$opts['peersfp'].'».'.N);
    sort($insts);
    $peersf = @fopen($opts['peersfp'], 'w');
    if ($peersf === false) {
        echo('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N);
        return;
    }
    foreach ($insts as $inst)
        fwrite($peersf, $inst.N);
    fclose($peersf);
}
|
|
|
|
|
|
|
|
|
|
// Emit a tick after every statement: the handlers registered further down
// with pcntl_signal() rely on ticks to be invoked while the script is running.
declare(ticks=1);
|
|
|
|
|
/**
 * Handler installed for the termination signals.
 *
 * Reports the received signal, closes the peers file through shutdown(false)
 * (the list is not sorted nor rewritten) and ends the script with status 2.
 *
 * @param int $signal Number of the signal that was received.
 */
function signalHandler($signal) {
    echo N, 'I got interrupted (signal: ', $signal, ').', N;
    shutdown(false);
    exit(2);
}
|
|
|
|
|
// Route the signals that normally end the program to signalHandler().
$handled = array(
    SIGTERM, // Termination ('kill' was called)
    SIGHUP,  // Terminal log-out
    SIGINT,  // Interrupted (Ctrl-C is pressed)
);
foreach ($handled as $signo)
    pcntl_signal($signo, 'signalHandler');
|
|
|
|
|
|
|
|
|
|
// Stream context used for the HTTP requests: 3 seconds timeout, TCP_NODELAY
// requested on the socket.
$contextopts = [
    'http'   => ['timeout' => 3],
    'socket' => ['tcp_nodelay' => true],
];
$context = stream_context_create($contextopts);

// Known instances; crawl() appends to it.
$insts = [];

// Exclusion regexes; refilled by updexarr().
$exarr = [];
|
|
|
|
|
|
|
|
|
|
// With «--restore», preload the known instances from an existing peers file.
if ($opts['restore']) {
    $restored = false;
    if (is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        echo('Loading «'.$opts['peersfp'].'».'.N);
        $restored = file($opts['peersfp'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    }
    // file() returns false on failure: keep $insts an array in that case.
    if ($restored === false)
        echo('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N);
    else
        $insts = $restored;
}

// The peers file receives every newly discovered instance while crawling, so
// there is no point in going on when it can't be written to.
$peersf = @fopen($opts['peersfp'], 'w');
if ($peersf === false)
    mexit('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N, 1);

// Mode «w» has just truncated the file: write the restored instances back at
// once, so that they are not lost if the program gets interrupted before the
// final, sorted save.
if (count($insts) > 0)
    fwrite($peersf, implode(N, $insts).N);
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether a value is empty or made of whitespace only.
 *
 * @param string $val Value to examine.
 * @return bool True when $val matches /^\s*$/, false otherwise.
 */
function isempty($val) {
    return preg_match('/^\s*$/', $val) === 1;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Blocks until a TCP connection to www.google.com, port 80, can be opened.
 *
 * Each failed attempt (1 second connection timeout) prints a timestamped
 * warning and is followed by a 5 seconds pause before the next one.
 *
 * NOTE(review): strftime() is deprecated as of PHP 8.1 — confirm which PHP
 * versions this script has to run on.
 */
function waituntilonline() {
    $probehost = 'www.google.com';
    for (;;) {
        $conn = @fsockopen($probehost, 80, $errno, $errstr, 1);
        if ($conn !== false)
            break;
        echo(strftime('%c').' - WARNING: it seems we are offline :-('.N);
        sleep(5);
    }
    // Only reachability matters: the connection itself is not used.
    fclose($conn);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reloads the exclusion regexes from the file given with «--excludefp».
 *
 * Does nothing when no exclusion file is configured. Otherwise the global
 * $exarr is rebuilt from the file: each line is trimmed, empty lines are
 * skipped and a line is kept only if preg_match() accepts it as a pattern;
 * rejected lines are reported together with their line number. When the file
 * can't be read a warning is printed and $exarr is left untouched.
 */
function updexarr() {
    global $exarr, $opts;

    $path = $opts['excludefp'];
    if (is_null($path))
        return;

    $rows = @file($path);
    if ($rows === false) {
        echo('WARNING: I could not open «'.$path.'» for reading.'.N);
        return;
    }

    $exarr = array();
    foreach ($rows as $idx => $row) {
        $line = trim($row);
        if (isempty($line))
            continue;
        // A test match tells whether the line compiles as a regular expression.
        if (@preg_match($line, 'foo') === false) {
            echo('WARNING: «'.$path.'», line '.($idx + 1).': «'.$line.'» is not a valid regular expression.'.N);
            continue;
        }
        $exarr[] = $line;
    }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether an instance name is matched by one of the exclusion regexes.
 *
 * @param string $inst Instance name to test against the global $exarr.
 * @return bool True as soon as one regex matches, false when none does.
 */
function ckexarr($inst) {
    global $exarr;

    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            return true;
        }
    }
    return false;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Records an instance, then recursively crawls the peers it advertises.
 *
 * An instance that is not known yet is appended to the global $insts and to
 * the peers file (global $peersf). Its «/api/v1/instance/peers» endpoint is
 * then fetched over HTTPS using the global stream $context; every returned
 * peer that is a string, matches no exclusion regex and is not known yet is
 * crawled in turn. Before anything else the function waits for connectivity
 * and reloads the exclusion rules. A statistics line is printed on every call.
 *
 * Names come from remote servers, so a name holding whitespace, control
 * characters or one of «/ ? # @ \» is refused: it can't be a host name, would
 * alter the requested URL and — for line breaks — corrupt the peers file.
 *
 * NOTE(review): recursion gets one level deeper per newly discovered peer, so
 * memory use grows with the length of the discovery chain.
 *
 * @param string $inst Name (domain) of the instance to crawl.
 */
function crawl($inst) {
    global $insts, $peersf, $context;

    waituntilonline();
    updexarr();

    if (isempty($inst)) {
        echo('I WON’T add nameless instances.'.N);
    } elseif (preg_match('~[\s\x00-\x1F\x7F/?#@\\\\]~', $inst) === 1) {
        echo('I WON’T add instances whose name contains invalid characters.'.N);
    } else {
        // Strict comparison: names are strings and must match exactly (loose
        // comparison treats numeric-looking strings such as «1.0» and «1.00» as equal).
        if (!in_array($inst, $insts, true)) {
            echo('«'.$inst.'» is not a known instance, I add it to the list of known instances.'.N);
            $insts[] = $inst;
            // The handle is missing when the peers file could not be opened.
            if (is_resource($peersf))
                fwrite($peersf, $inst.N);
        }

        echo('«'.$inst.'»: trying to load instance’s peers ... ');
        $body = @file_get_contents('https://'.$inst.'/api/v1/instance/peers', false, $context);
        if (!$body) {
            // Covers a failed request as well as an empty answer.
            echo('ERROR :-('.N);
        } else {
            echo('OK :-)'.N);
            $peers = json_decode($body, true);
            if (is_array($peers)) {
                foreach ($peers as $peer) {
                    if (!is_string($peer)) {
                        echo('>>> I won’t crawl this peer because its name is not a string.'.N);
                        continue;
                    }
                    if (ckexarr($peer)) {
                        echo('>>> I won’t crawl «'.$peer.'» because its name matches with an exclusion regex.'.N);
                        continue;
                    }
                    if (!in_array($peer, $insts, true)) {
                        echo('>>> I will crawl «'.$peer.'».'.N);
                        crawl($peer);
                    }
                }
            }
        }
    }

    echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
}
|
|
|
|
|
|
|
|
|
|
// Entry point: crawl starting from the configured instance, then save the
// sorted list of instances and leave with a success status.
crawl($opts['startinst']);
echo 'DONE CRAWLING! :-)', N;
shutdown(true);
exit(0);

?>
|