MastodonStartpage/web/mustard/crawler/peerscrawl.php
pezcurrel a1d618f99a ...
2020-05-08 18:03:11 +02:00

245 lines
6.7 KiB
PHP
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Adopt the locale named by the LANG environment variable.
setlocale(LC_ALL,getenv('LANG'));
// Shorthand for the line terminator appended to every printed message.
define('N',"\n");
// Default values of the options; each one can be overridden on the command line.
$opts=[
	'startinst'=>'mastodon.social',
	'peersfp'=>'peers',
	'restore'=>false,
	'excludefp'=>null,
];
// Help text printed by «-h»/«--help». The default values are taken from $opts,
// so $opts must already hold the defaults when this statement runs.
$help='peerscrawl.php
DESCRIPTION
This program tries to build a fairly complete list of mastodon instances.
SYNOPSIS
peerscrawl.php [options]
OPTIONS
-s, --startinst <domain>
Defines the first instance to crawl.
DEFAULT: «'.$opts['startinst'].'»
-p, --peersfp <file>
Defines the file into which the ordered list of instances will be saved.
DEFAULT: «'.$opts['peersfp'].'»
-r, --restore
If peers file already exists on programs start it will be loaded into
memory and each instance it contains will be considered “already
crawled”, thus allowing to “restore an interrupted crawling session”.
-e, --excludefp <file>
Defines a file containing exclusion rules: one regular expression per
line (empty lines are ignored). Any instance matching any defined regex
will be ignored by the program. Changes made to this file during program
execution will be taken into account.
-h, --help
Shows this help text and exits.
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Parse the command line. Arguments that do not start with «-» are ignored.
// Options taking a value consume the following argument; flags consume nothing.
for ($i=1; $i<$argc; $i++) {
	if (substr($argv[$i],0,1)!=='-')
		continue;
	// True when at least one more argument follows the current option.
	$hasvalue=($i+1<$argc);
	switch ($argv[$i]) {
		case '-s':
		case '--startinst':
			if (!$hasvalue)
				mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N,1);
			$i++;
			$opts['startinst']=$argv[$i];
			break;
		case '-p':
		case '--peersfp':
			if (!$hasvalue)
				mexit('Option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N,1);
			$i++;
			$opts['peersfp']=$argv[$i];
			break;
		case '-r':
		case '--restore':
			// Plain flag: $i must NOT be advanced here, otherwise the argument
			// following «-r» would be skipped without being parsed.
			$opts['restore']=true;
			break;
		case '-e':
		case '--excludefp':
			// The exclusion file has to exist already and be a readable regular file.
			if (!$hasvalue || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
				mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable files path (use «-h» for more info).'.N,1);
			$i++;
			$opts['excludefp']=$argv[$i];
			break;
		case '-h':
		case '--help':
			mexit($help,0);
			break;
		default:
			mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N,1);
			break;
	}
}
/**
 * Prints a message and terminates the program.
 *
 * @param string $msg  text written to standard output, exactly as given
 * @param int    $code exit status handed to exit()
 */
function mexit($msg,$code) {
	echo $msg;
	exit($code);
}
/**
 * Saves the sorted list of known instances into the peers file.
 *
 * Closes the handle used for incremental writes (if it is still open), sorts
 * the global $insts in place and rewrites the peers file from scratch, one
 * instance per line. A failure to open the file is reported on standard
 * output; nothing is thrown and nothing is returned.
 */
function shutdown() {
	global $opts, $peersf, $insts;
	// is_resource() is false both for a failed fopen() and for a handle that
	// has already been closed, so calling this function twice (e.g. when a
	// signal arrives during the final save) never hands fclose() an invalid
	// handle.
	if (is_resource($peersf))
		fclose($peersf);
	echo('Saving ordered instances list into «'.$opts['peersfp'].'».'.N);
	sort($insts);
	$peersf=@fopen($opts['peersfp'],'w');
	if ($peersf===false) {
		echo('Couldnt open «'.$opts['peersfp'].'» for writing.'.N);
		return;
	}
	foreach ($insts as $inst)
		fwrite($peersf,$inst.N);
	fclose($peersf);
}
// With ticks enabled PHP gets the chance to run the signal handlers
// registered below while the statements that follow in this file execute.
declare(ticks=1);
/**
 * Signal handler: reports the received signal, saves the list of known
 * instances through shutdown() and terminates with exit status 2.
 *
 * @param int $signal number of the signal that was caught
 */
function signalHandler($signal) {
	echo(N.'I got interrupted (signal: '.$signal.').'.N);
	shutdown();
	exit(2);
}
// SIGTERM: termination ('kill' was called)
// SIGHUP:  terminal log-out
// SIGINT:  interrupted (Ctrl-C is pressed)
foreach (array(SIGTERM,SIGHUP,SIGINT) as $signo)
	pcntl_signal($signo,'signalHandler');
// Stream context used for the HTTP requests: a 3 seconds timeout on the
// «http» wrapper and the «tcp_nodelay» socket option switched on.
$contextopts=[
	'http'=>['timeout'=>3],
	'socket'=>['tcp_nodelay'=>true],
];
$context=stream_context_create($contextopts);
// $insts: every instance known so far; $exarr: the exclusion regexes in force.
$insts=array();
$exarr=array();
// With «--restore» the instances saved by a previous run are loaded and
// treated as already known.
if ($opts['restore']) {
	if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
		echo('Loading «'.$opts['peersfp'].'».'.N);
		$loaded=file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
		// file() returns false on failure: keep $insts an array in that case.
		if ($loaded!==false)
			$insts=$loaded;
		else
			echo('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N);
	} else {
		echo('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N);
	}
}
// Mode «w» truncates the peers file; newly discovered instances get appended
// to this handle while crawling.
$peersf=@fopen($opts['peersfp'],'w');
if ($peersf===false)
	mexit('Couldnt open «'.$opts['peersfp'].'» for writing.'.N,1);
// The file has just been emptied: write the restored instances back at once,
// so that they are not lost if the program dies before its final save.
foreach ($insts as $inst)
	fwrite($peersf,$inst.N);
/**
 * Tells whether a value is the empty string or consists of whitespace only.
 *
 * @param string $val value to test
 * @return bool true when $val matches /^\s*$/, false otherwise
 */
function isempty($val) {
	return preg_match('/^\s*$/',$val)===1;
}
/**
 * Blocks until a TCP connection to www.google.com:80 can be opened.
 *
 * Every failed attempt (connection timeout: 1 second) prints a timestamped
 * warning and is followed by a 5 seconds pause before the next try.
 */
function waituntilonline() {
	$host='www.google.com';
	while (($sock=@fsockopen($host,80,$errno,$errstr,1))===false) {
		// date() replaces strftime('%c'), which is deprecated as of PHP 8.1;
		// the format mimics what «%c» yields in the C locale.
		echo(date('D M j H:i:s Y').' - WARNING: it seems we are offline :-('.N);
		sleep(5);
	}
	fclose($sock);
}
/**
 * Reloads the exclusion rules from the file named by $opts['excludefp'].
 *
 * Does nothing when no exclusion file was configured. When the file cannot be
 * opened a warning is printed and the rules already in $exarr are kept.
 * Otherwise $exarr is rebuilt from scratch: each non-blank line is trimmed
 * and kept if preg_match() accepts it as a pattern; rejected lines are
 * reported together with their line number.
 */
function updexarr() {
	global $exarr, $opts;
	if (is_null($opts['excludefp']))
		return;
	$f=@fopen($opts['excludefp'],'r');
	if ($f===false) {
		echo('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N);
		return;
	}
	$exarr=array();
	$lineno=0;
	// Loop on the result of fgets() rather than on feof(), so that the false
	// returned at end of file is never processed as if it were a line.
	while (($line=fgets($f))!==false) {
		$lineno++;
		$line=trim($line);
		if (isempty($line))
			continue;
		// A trial match tells whether the line is a usable regular expression:
		// preg_match() returns false for a pattern it cannot compile.
		if (@preg_match($line,'foo')!==false)
			$exarr[]=$line;
		else
			echo('WARNING: «'.$opts['excludefp'].'», line '.$lineno.': «'.$line.'» is not a valid regular expression.'.N);
	}
	// This function runs once per crawled instance: release the handle explicitly.
	fclose($f);
}
/**
 * Checks an instance name against the exclusion rules held in $exarr.
 *
 * @param string $inst instance name to check
 * @return bool true as soon as one regular expression matches, false if none does
 */
function ckexarr($inst) {
	global $exarr;
	foreach ($exarr as $pattern) {
		if (preg_match($pattern,$inst)===1) {
			return true;
		}
	}
	return false;
}
/**
 * Crawls one instance and, recursively, every peer of it not yet known.
 *
 * The instance is added to the global $insts (and appended to the peers file)
 * if it is new; then its «/api/v1/instance/peers» endpoint is fetched and
 * each returned peer is crawled in turn, unless it is not a string, matches
 * an exclusion regex or is already known. Before doing anything the function
 * waits for the network to be reachable and reloads the exclusion rules.
 *
 * NOTE(review): recursion depth is bounded only by the number of instances
 * discovered, and each pending level keeps its decoded peers list in memory.
 *
 * @param string $inst domain name of the instance to crawl
 */
function crawl($inst) {
	global $insts, $peersf, $context;
	// Set of known instance names (name => true) mirroring $insts. It gives a
	// constant time, exact membership test; a loose in_array() would be a
	// linear scan and would also treat distinct numeric-looking names
	// (e.g. «1.5» and «1.50») as equal. Built on first call, so it includes
	// whatever was restored into $insts before crawling started.
	static $known=null;
	waituntilonline();
	updexarr();
	if ($known===null)
		$known=array_fill_keys($insts,true);
	if (isempty($inst)) {
		echo('I WONT add nameless instances.'.N);
	} else {
		if (!isset($known[$inst])) {
			echo('«'.$inst.'» is not a known instance, I add it to the list of known instances.'.N);
			$insts[]=$inst;
			$known[$inst]=true;
			// The handle is not valid when the peers file could not be opened.
			if (is_resource($peersf))
				fwrite($peersf,$inst.N);
		}
		echo('«'.$inst.'»: trying to load instances peers ... ');
		$body=@file_get_contents('https://'.$inst.'/api/v1/instance/peers',false,$context);
		if ($body===false || $body==='') {
			echo('ERROR :-('.N);
		} else {
			echo('OK :-)'.N);
			$peers=json_decode($body,true);
			// Anything that does not decode to an array is silently skipped.
			if (is_array($peers)) {
				foreach ($peers as $peer) {
					if (!is_string($peer)) {
						echo('>>> I wont crawl this peer because its name is not a string.'.N);
						continue;
					}
					if (ckexarr($peer)) {
						echo('>>> I wont crawl «'.$peer.'» because its name matches with an exclusion regex.'.N);
						continue;
					}
					// Peers already known are skipped without any message.
					if (!isset($known[$peer])) {
						echo('>>> I will crawl «'.$peer.'».'.N);
						crawl($peer);
					}
				}
			}
		}
	}
	echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
}
// Main sequence: crawl starting from the configured instance, then save the
// sorted list of known instances and leave with a success status.
crawl($opts['startinst']);
echo 'DONE CRAWLING! :-)'.N;
shutdown();
exit(0);
?>