123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
- #!/usr/bin/php
- <?php
- /*
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
// Honour the locale configured in the environment: an empty locale string
// makes setlocale() pick it up from LC_ALL / LC_* / LANG by itself, which also
// covers the case where LANG is not set at all.
setlocale(LC_ALL, '');

// Line terminator appended to every message printed and every line written.
define('N', "\n");

// Run-time options; each value can be overridden from the command line.
$opts = array(
    'startinst' => 'mastodon.social',   // first instance to crawl
    'peersfp'   => __DIR__ . '/peers',  // file receiving the list of instances
    'restore'   => false,               // reload an existing peers file on start
    'excludefp' => null,                // optional file of exclusion regexes
);

// Text printed by «-h» / «--help»; the defaults shown come from $opts above.
$help = 'peerscrawl.php

DESCRIPTION
    This program tries to build a fairly complete list of mastodon instances.

SYNOPSIS
    peerscrawl.php [options]

OPTIONS
    -s, --startinst <domain>
        Defines the first instance to crawl.
        DEFAULT: «' . $opts['startinst'] . '»

    -p, --peersfp <file>
        Defines the file into which the ordered list of instances will be saved.
        DEFAULT: «' . $opts['peersfp'] . '»

    -r, --restore
        If peers file already exists on program’s start it will be loaded into
        memory and each instance it contains will be considered “already
        crawled”, thus allowing to “restore an interrupted crawling session”.

    -e, --excludefp <file>
        Defines a file containing exclusion rules: one regular expression per
        line (empty lines are ignored). Any instance matching any defined regex
        will be ignored by the program. Changes made to this file during program
        execution will be taken into account.

    -h, --help
        Shows this help text and exits.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.' . N;
// Parse the command line into $opts. Arguments that do not start with «-»
// are not options and are skipped.
for ($i = 1; $i < $argc; $i++) {
    if (substr($argv[$i], 0, 1) !== '-') {
        continue;
    }
    // True when another argument follows, i.e. the option can take a value.
    $hasvalue = ($i + 1 < $argc);
    switch ($argv[$i]) {
        case '-s':
        case '--startinst':
            if (!$hasvalue) {
                mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N, 1);
            }
            $opts['startinst'] = $argv[++$i];
            break;
        case '-p':
        case '--peersfp':
            if (!$hasvalue) {
                mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N, 1);
            }
            $opts['peersfp'] = $argv[++$i];
            break;
        case '-r':
        case '--restore':
            // Plain flag: it takes no value, so the index must NOT be advanced
            // here, otherwise the option that follows would be swallowed.
            $opts['restore'] = true;
            break;
        case '-e':
        case '--excludefp':
            // is_file() already implies that the path exists.
            if (!$hasvalue || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1])) {
                mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable file’s path (use «-h» for more info).'.N, 1);
            }
            $opts['excludefp'] = $argv[++$i];
            break;
        case '-h':
        case '--help':
            mexit($help, 0);
        default:
            mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N, 1);
    }
}
/**
 * Prints a message on standard output and terminates the program.
 *
 * @param string $msg  Text printed verbatim (no newline is appended).
 * @param int    $code Value handed to exit().
 */
function mexit($msg, $code)
{
    print $msg;
    exit($code);
}
/**
 * Closes the peers file and, when asked to, rewrites it as a sorted list.
 *
 * Works on the globals $opts (for the «peersfp» path), $peersf (handle of the
 * peers file) and $insts (list of known instances, sorted in place).
 *
 * @param bool $dosort When true, sorts the known instances and overwrites the
 *                     peers file with them, one per line; when false only the
 *                     open handle is closed.
 */
function shutdown($dosort)
{
    global $opts, $peersf, $insts;

    // is_resource() is false for a handle that has already been closed, so a
    // second call (for instance from the signal handler while the final save
    // is running) does not hand a dead stream to fclose().
    if (is_resource($peersf)) {
        fclose($peersf);
    }
    if (!$dosort) {
        return;
    }

    echo('Saving ordered instances list into «'.$opts['peersfp'].'».'.N);
    // Compare names strictly as strings: the default flag compares
    // numeric-looking names as numbers, which gives no consistent order once
    // they are mixed with ordinary host names.
    sort($insts, SORT_STRING);
    $peersf = @fopen($opts['peersfp'], 'w');
    if ($peersf === false) {
        echo('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N);
        return;
    }
    foreach ($insts as $inst) {
        fwrite($peersf, $inst.N);
    }
    fclose($peersf);
}
// Ticks give PHP the opportunity to run the signal handlers registered below
// while the script is executing.
declare(ticks=1);

/**
 * Reacts to a termination signal: reports it, closes the peers file without
 * re-sorting it and exits with status 2.
 *
 * @param int $signal Number of the signal that was received.
 */
function signalHandler($signal)
{
    echo(N.'I got interrupted (signal: '.$signal.').'.N);
    shutdown(false);
    exit(2);
}

// pcntl is an optional extension: when it is missing the crawl still runs,
// it simply cannot intercept signals, so registration is skipped instead of
// dying on an undefined function.
if (function_exists('pcntl_signal')) {
    pcntl_signal(SIGTERM, 'signalHandler'); // Termination ('kill' was called)
    pcntl_signal(SIGHUP, 'signalHandler');  // Terminal log-out
    pcntl_signal(SIGINT, 'signalHandler');  // Interrupted (Ctrl-C is pressed)
}
// Options of the stream context used for the HTTP requests: a 3 second
// timeout for HTTP and TCP_NODELAY on the underlying socket.
$contextopts = array();
$contextopts['http'] = array('timeout' => 3);
$contextopts['socket'] = array('tcp_nodelay' => true);
$context = stream_context_create($contextopts);

// $insts: list of known instances; $exarr: list of exclusion regexes.
$insts = $exarr = array();
// With «--restore», reload the instances saved by a previous run so that they
// count as already crawled.
if ($opts['restore']) {
    $restored = false;
    if (is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        echo('Loading «'.$opts['peersfp'].'».'.N);
        $restored = file($opts['peersfp'], FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
    }
    // file() returns false on failure: $insts must stay an array in that case.
    if ($restored !== false) {
        $insts = $restored;
    } else {
        echo('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N);
    }
}

$peersf = @fopen($opts['peersfp'], 'w');
if ($peersf === false) {
    // Nothing could be saved, so stop here with a clear message.
    mexit('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N, 1);
}
// Mode «w» has just truncated the file: write the restored instances back at
// once, otherwise an interrupted run would leave only the newly found ones on
// disk and the earlier results would be lost.
foreach ($insts as $inst) {
    fwrite($peersf, $inst.N);
}
/**
 * Tells whether a value is empty or made only of whitespace.
 *
 * @param string $val Text to examine.
 * @return bool True when $val matches /^\s*$/, false otherwise.
 */
function isempty($val)
{
    return preg_match('/^\s*$/', $val) === 1;
}
/**
 * Blocks until a TCP connection to www.google.com on port 80 can be opened.
 *
 * Each failed attempt (made with a 1 second timeout) prints a timestamped
 * warning and is followed by a 5 second pause before the next try.
 */
function waituntilonline()
{
    $host = 'www.google.com';
    for (;;) {
        $sock = @fsockopen($host, 80, $errno, $errstr, 1);
        if ($sock !== false) {
            break;
        }
        echo(strftime('%c').' - WARNING: it seems we are offline :-('.N);
        sleep(5);
    }
    fclose($sock);
}
/**
 * Reloads the exclusion rules from the file named by $opts['excludefp'].
 *
 * Does nothing when no exclusion file is configured. When the file can be
 * opened, the global $exarr is rebuilt from scratch with every non-blank line
 * that is usable as a regular expression; unusable lines are reported with
 * their line number. When the file cannot be opened a warning is printed and
 * $exarr keeps its previous content.
 */
function updexarr()
{
    global $exarr, $opts;

    if (is_null($opts['excludefp'])) {
        return;
    }
    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        echo('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N);
        return;
    }

    $exarr = array();
    $lineno = 0;
    // Loop on the result of fgets() rather than on feof(): a read that keeps
    // failing without ever reaching end-of-file then ends the loop instead of
    // spinning forever.
    while (($raw = fgets($f)) !== false) {
        $lineno++;
        $line = trim($raw);
        if (isempty($line)) {
            continue;
        }
        // Trial match: preg_match() returns false on failure, so a false
        // result marks the line as unusable; «@» hides the warning it raises.
        if (@preg_match($line, 'foo') !== false) {
            $exarr[] = $line;
        } else {
            echo('WARNING: «'.$opts['excludefp'].'», line '.$lineno.': «'.$line.'» is not a valid regular expression.'.N);
        }
    }
    // This function runs once per crawled instance: release the handle
    // explicitly.
    fclose($f);
}
/**
 * Checks an instance name against the exclusion rules currently loaded.
 *
 * @param string $inst Instance name to test.
 * @return bool True as soon as one regex of the global $exarr matches $inst,
 *              false when none does.
 */
function ckexarr($inst)
{
    global $exarr;

    $excluded = false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            $excluded = true;
            break;
        }
    }
    return $excluded;
}
/**
 * Crawls every instance reachable from a starting instance.
 *
 * Each instance taken from the work list is added to the global $insts list
 * and appended to the peers file (global $peersf) unless it is already known.
 * Its «/api/v1/instance/peers» endpoint is then fetched over HTTPS, and every
 * peer that is neither known nor excluded is scheduled for the same treatment.
 *
 * The work list is an explicit stack instead of recursion, so the call depth
 * and the number of peers lists kept in memory do not grow with the number of
 * instances discovered.
 *
 * @param string $inst Domain name of the instance to start from.
 */
function crawl($inst)
{
    global $insts, $peersf, $context;

    if (!is_array($insts)) {
        $insts = array();
    }
    // Index of every name already recorded in $insts (value true) or merely
    // scheduled for crawling (value false). isset() on it replaces a linear,
    // loosely-comparing in_array() scan of the whole list for each peer.
    $seen = array_fill_keys($insts, true);
    if (!isset($seen[$inst])) {
        $seen[$inst] = false;
    }
    $pending = array($inst);
    $isstart = true;

    while (count($pending) > 0) {
        $current = array_pop($pending);
        $wasstart = $isstart;
        $isstart = false;

        waituntilonline();
        updexarr();

        if (isempty($current)) {
            echo('I WON’T add nameless instances.'.N);
            echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
            continue;
        }
        // The rules may have changed while this peer was waiting on the
        // stack, so check it again now that they have been reloaded. As
        // before, the start instance itself is never filtered.
        if (!$wasstart && ckexarr($current)) {
            echo('>>> I won’t crawl «'.$current.'» because its name matches with an exclusion regex.'.N);
            continue;
        }

        // Only unrecorded names are added: with «--restore» the start
        // instance may already be in the list and must not end up there twice.
        if ($seen[$current] !== true) {
            echo('«'.$current.'» is not a known instance, I add it to the list of known instances.'.N);
            $insts[] = $current;
            $seen[$current] = true;
            if (is_resource($peersf)) {
                fwrite($peersf, $current.N);
            }
        }

        echo('«'.$current.'»: trying to load instance’s peers ... ');
        $body = @file_get_contents('https://'.$current.'/api/v1/instance/peers', false, $context);
        if ($body === false || $body === '') {
            echo('ERROR :-('.N);
        } else {
            echo('OK :-)'.N);
            $peers = json_decode($body, true);
            if (is_array($peers)) {
                $fresh = array();
                foreach ($peers as $peer) {
                    if (!is_string($peer)) {
                        echo('>>> I won’t crawl this peer because its name is not a string.'.N);
                        continue;
                    }
                    // Peer names come from a remote server and end up both in
                    // a URL and in a one-name-per-line file: refuse anything
                    // holding whitespace, control characters or characters
                    // that would alter the structure of the URL.
                    if (preg_match('/[\s\x00-\x1f\x7f\/\\\\?#@]/', $peer) === 1) {
                        echo('>>> I won’t crawl this peer because its name is not a valid host name.'.N);
                        continue;
                    }
                    if (ckexarr($peer)) {
                        echo('>>> I won’t crawl «'.$peer.'» because its name matches with an exclusion regex.'.N);
                        continue;
                    }
                    if (isset($seen[$peer])) {
                        continue;
                    }
                    echo('>>> I will crawl «'.$peer.'».'.N);
                    $seen[$peer] = false;
                    $fresh[] = $peer;
                }
                // Pushed in reverse so that the first peer listed is the next
                // one popped, i.e. the crawl proceeds depth-first.
                foreach (array_reverse($fresh) as $peer) {
                    $pending[] = $peer;
                }
            }
        }
        echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
    }
}
// Entry point: crawl outwards from the configured start instance, then save
// the sorted list of instances and exit with status 0.
crawl($opts['startinst']);
print 'DONE CRAWLING! :-)'.N;
shutdown(true);
exit(0);
?>
|