2020-10-13 08:21:26 +02:00
#!/usr/bin/php
< ? php
/*
This program is free software : you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation , either version 3 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with this program . If not , see < http :// www . gnu . org / licenses />.
*/
2022-12-21 22:07:05 +01:00
define ( 'N' , " \n " );
2020-10-13 08:21:26 +02:00
2022-12-16 21:59:26 +01:00
require ( __DIR__ . '/../site/mustard/include/gurl.php' );
2022-12-21 22:07:05 +01:00
require ( __DIR__ . '/../site/mustard/include/ghs.php' );
require ( __DIR__ . '/../site/mustard/include/ght.php' );
2020-10-13 08:21:26 +02:00
2022-12-21 22:07:05 +01:00
// Runtime options with their default values. The command-line parser further
// down overrides 'startinst', 'peersfp', 'apeersfp', 'cpeersfp', 'restore',
// 'ignorelock', 'excludefp', 'timeout', 'excludedead' and 'verbose'.
// 'inifp' (read with parse_ini_file() when 'excludedead' is set) and
// 'curltimeout' (passed to gurl() together with 'timeout') have no
// command-line switch in this file.
$opts = [
2020-10-21 15:26:31 +02:00
'inifp' => __DIR__ . '/../conf/mustard.ini' ,
2020-10-13 08:21:26 +02:00
'startinst' => 'mastodon.social' ,
'peersfp' => __DIR__ . '/peers' ,
2022-12-21 22:07:05 +01:00
'apeersfp' => __DIR__ . '/peers.all' ,
'cpeersfp' => __DIR__ . '/peers.checked' ,
2020-10-13 08:21:26 +02:00
'restore' => false ,
'excludefp' => null ,
'timeout' => 5 ,
2022-12-21 22:07:05 +01:00
'curltimeout' => 10 ,
2020-10-13 08:21:26 +02:00
'verbose' => false ,
'excludedead' => false ,
2020-10-14 08:37:41 +02:00
'ignorelock' => false
2022-12-21 22:07:05 +01:00
];
2020-10-13 08:21:26 +02:00
// Usage text printed by the «-h» / «--help» option (handed to mexit() with
// exit code 0 by the option parser). The defaults shown in the text are
// interpolated from $opts at the time this string is built, i.e. before any
// command-line override is applied.
$help = ' peerscrawl . php
DESCRIPTION
This program tries to build a fairly complete list of fediverse instances
exposing the [ instance ] / api / v1 / instance / peers endpoint .
SYNOPSIS
peerscrawl . php [ options ]
OPTIONS
- s , -- startinst < domain >
Defines the first instance to crawl .
DEFAULT : « '.$opts[' startinst '].' »
- p , -- peersfp < file >
Defines the file into which the ordered list of responding instances
will be saved .
DEFAULT : « '.$opts[' peersfp '].' »
2022-12-21 22:07:05 +01:00
- a , -- apeersfp < file >
Defines the file into which the ordered list of all instances will
be saved .
DEFAULT : « '.$opts[' apeersfp '].' »
- c , -- cpeersfp < file >
Defines the file into which the ordered list of all checked instances will
be saved .
DEFAULT : « '.$opts[' cpeersfp '].' »
2020-10-14 08:37:41 +02:00
- I , -- ignorelock
Normally , if its lockfile exists , the program exits with an error before
doing anything . With this option the lockfile is ignored . Please verify
that the program is not already running before using it .
2020-10-13 08:21:26 +02:00
- r , -- restore
If peers file already exists on program’ s start it will be loaded into
memory and each instance it contains will be considered “already
crawled” , thus allowing to “restore an interrupted crawling session” .
- e , -- excludefp < file >
Defines a file containing exclusion rules : one regular expression per
line ( empty lines are ignored ) . Any instance matching any defined regex
will be ignored by the program . Changes made to this file during program
execution will be taken into account .
- t , -- timeout < seconds >
Defines the timeout in seconds for every connection attempt .
DEFAULT : « '.$opts[' timeout '].' »
2022-12-02 06:04:25 +01:00
- E , -- excludedead
Exclude instances marked as " Dead " in the database .
2020-10-13 08:21:26 +02:00
- v , -- verbose
Be more verbose .
2022-12-02 06:04:25 +01:00
- h , -- help
Show this help text and exit .
2020-10-13 08:21:26 +02:00
This program comes with ABSOLUTELY NO WARRANTY ; for details see the source .
This is free software , and you are welcome to redistribute it under
certain conditions ; see < http :// www . gnu . org / licenses /> for details . ' . N ;
// Command-line parsing. Arguments that do not start with «-» are skipped.
// Every error goes through mexit() with exit code 1; «-h» prints $help and
// exits with code 0.

// Switches that take no value: each one simply sets the mapped $opts key to true.
$flagopts = [
    '-r' => 'restore',     '--restore'     => 'restore',
    '-I' => 'ignorelock',  '--ignorelock'  => 'ignorelock',
    '-E' => 'excludedead', '--excludedead' => 'excludedead',
    '-v' => 'verbose',     '--verbose'     => 'verbose',
];
// Switches followed by an output file path, stored as-is into the mapped $opts key.
$pathopts = [
    '-p' => 'peersfp',  '--peersfp'  => 'peersfp',
    '-a' => 'apeersfp', '--apeersfp' => 'apeersfp',
    '-c' => 'cpeersfp', '--cpeersfp' => 'cpeersfp',
];

for ($i = 1; $i < $argc; $i++) {
    $arg = $argv[$i];
    if (substr($arg, 0, 1) != '-') {
        continue;
    }
    // True when no further argument is available to serve as this option's value.
    $novalue = ($i + 1 >= $argc);

    if (isset($flagopts[$arg])) {
        $opts[$flagopts[$arg]] = true;
        continue;
    }

    if (isset($pathopts[$arg])) {
        if ($novalue) {
            mexit('Error: option «' . $arg . '» has to be followed by a file’ s path (use «-h» for more info).' . N, 1);
        }
        $opts[$pathopts[$arg]] = $argv[++$i];
        continue;
    }

    switch ($arg) {
        case '-s':
        case '--startinst':
            if ($novalue) {
                mexit('Error: option «' . $arg . '» has to be followed by a domain name (use «-h» for more info).' . N, 1);
            }
            $opts['startinst'] = $argv[++$i];
            break;

        case '-e':
        case '--excludefp':
            // The exclusion file must already exist and be a readable regular file.
            if ($novalue || !file_exists($argv[$i + 1]) || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1])) {
                mexit('Error: option «' . $arg . '» has to be followed by an existing, readable file’ s path (use «-h» for more info).' . N, 1);
            }
            $opts['excludefp'] = $argv[++$i];
            break;

        case '-t':
        case '--timeout':
            // Only a plain run of decimal digits is accepted.
            if ($novalue || preg_match('/^[0-9]+$/', $argv[$i + 1]) !== 1) {
                mexit('Error: option «' . $arg . '» has to be followed by a number of seconds (use «-h» for more info).' . N, 1);
            }
            $opts['timeout'] = $argv[++$i] + 0;
            break;

        case '-h':
        case '--help':
            mexit($help, 0);
            break;

        default:
            mexit('Error: option «' . $arg . '» is unknown (use «-h» for more info).' . N, 1);
            break;
    }
}
2020-10-14 08:37:41 +02:00
// Single-instance guard: refuse to start while the lockfile exists, unless
// the user asked to ignore it with «-I».
$lockfp = __DIR__ . '/peerscrawl.lock';
if (!$opts['ignorelock'] && is_file($lockfp)) {
    // Plain echo + exit here: mexit() unlinks $lockfp, which is presumably
    // why it is not used for this particular error.
    echo 'Lockfile exists: it seems the program is already running; if you’ re sure it’ s not true, use «-I» to force execution.' . N;
    exit(2);
}
touch($lockfp);

// Route termination signals to signalHandler() so that output files get
// closed and the lockfile removed on interruption.
pcntl_async_signals(true);
foreach ([
    SIGTERM, // Termination ('kill' was called)
    SIGHUP,  // Terminal log-out
    SIGINT,  // Interrupted (Ctrl-C is pressed)
] as $signo) {
    pcntl_signal($signo, 'signalHandler');
}
// Optionally load from the database the URIs of instances flagged as dead
// (Dead=1), so that crawl() can leave them out of the next-round lists.
$deadinsts = [];
if ($opts['excludedead']) {
    $iniarr = @parse_ini_file($opts['inifp'])
        or mexit('Error: couldn’ t open «' . $opts['inifp'] . '».' . N, 1);

    try {
        $link = @mysqli_connect($iniarr['db_host'], $iniarr['db_admin_name'], $iniarr['db_admin_password'], $iniarr['db_name'], $iniarr['db_port'], $iniarr['db_socket']);
    } catch (Exception $error) {
        mexit('Error: couldn’ t connect to MySQL server: ' . mysqli_connect_error() . '.' . N, 1);
    }
    // for php versions < 8
    if ($link === false) {
        mexit('Error: couldn’ t connect to MySQL server: ' . mysqli_connect_error() . '.' . N, 1);
    }

    try {
        $res = mysqli_set_charset($link, 'utf8mb4');
    } catch (Exception $error) {
        // Message now closes its parenthesis, like the non-exception branch below.
        mexit('Error: couldn’ t set «utf8mb4» charset for MySQL: ' . mysqli_error($link) . ' (' . mysqli_errno($link) . ').' . N, 1);
    }
    // for php versions < 8
    if ($res === false) {
        mexit('Error: couldn’ t set «utf8mb4» charset for MySQL: ' . mysqli_error($link) . ' (' . mysqli_errno($link) . ').' . N, 1);
    }

    // Consume the whole result set BEFORE closing the connection.
    $res = myq($link, 'SELECT URI FROM Instances WHERE Dead=1');
    while ($row = mysqli_fetch_assoc($res)) {
        $deadinsts[] = $row['URI'];
    }
    unset($res);

    mysqli_close($link);
    // mexit() closes $link whenever it is set and not false; clearing it here
    // prevents a later mexit() from calling mysqli_close() on a connection
    // that is already closed.
    $link = null;

    gecho('Loaded list of dead instances (' . count($deadinsts) . ').' . N, true, false);
}
// Global crawl state:
//   $insts  - instances that answered with a valid peers list
//   $ainsts - every instance name seen so far
//   $cinsts - instances whose peers endpoint has been requested
//   $exarr  - exclusion regular expressions (refreshed by updexarr())
$insts = [];
$ainsts = [];
$cinsts = [];
$exarr = [];

// With «-r» the existing peers file is loaded as the initial $insts list.
if ($opts['restore']) {
    if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        gecho('Loading «' . $opts['peersfp'] . '».' . N, true, false);
        $insts = file($opts['peersfp'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        // file() may still fail after the checks above (e.g. the file vanished meanwhile).
        if ($insts === false) {
            mexit('Error: couldn’ t open «' . $opts['peersfp'] . '» for reading.' . N, 1);
        }
    } else {
        mexit('Error: couldn’ t open «' . $opts['peersfp'] . '» for reading.' . N, 1);
    }
}

$peersf = @fopen($opts['peersfp'], 'w');
if (!$peersf) mexit('Error: couldn’ t open «' . $opts['peersfp'] . '» in write mode.' . N, 1);
// Mode 'w' has just truncated the peers file: write the restored entries
// straight back, otherwise an interrupted run (which skips the final
// sort-and-save step) would lose everything that had been restored.
foreach ($insts as $restored) {
    fwrite($peersf, $restored . N);
}
unset($restored);

$apeersf = @fopen($opts['apeersfp'], 'w');
if (!$apeersf) mexit('Error: couldn’ t open «' . $opts['apeersfp'] . '» in write mode.' . N, 1);
$cpeersf = @fopen($opts['cpeersfp'], 'w');
if (!$cpeersf) mexit('Error: couldn’ t open «' . $opts['cpeersfp'] . '» in write mode.' . N, 1);
// Run the crawl starting from the configured instance, then save the sorted
// lists and print a summary.
$tini = time();
// crawl() only assigns $maxround when a further round is started; give it a
// defined value so the summary below is correct for a single-round crawl too.
$maxround = 1;
crawl([$opts['startinst']], 1);
gecho('Done crawling! :-)' . N, true, false);
shutdown(true);
$now = time();
gecho('Crawl started on ' . date('Y-m-d H:i:s', $tini) . ' and ended on ' . date('Y-m-d H:i:s', $now) . '.' . N, true, false);
// "checked" is reported from $cinsts (instances actually requested), not from
// $ainsts, which also holds names that were only seen in peers lists.
gecho(count($cinsts) . ' URIs checked in ' . ght($now - $tini) . ' (' . $maxround . ' rounds); ' . count($insts) . ' responded. Max. memory usage: ' . ghs(memory_get_peak_usage(true)) . '.' . N, true, false);
exit(0);
// functions
/**
 * Crawl one "round": request the peers list of every instance in $list,
 * collect the not-yet-seen, acceptable peers into the next round's list and
 * recurse on it until a round yields no new candidates.
 *
 * Side effects (all on globals): appends to $ainsts / $cinsts / $insts and to
 * the matching open files ($apeersf / $cpeersf / $peersf), and keeps
 * $maxround equal to the deepest round number reached.
 *
 * @param array $list Instance host names to query in this round.
 * @param int   $id   1-based round number.
 */
function crawl($list, $id) {
    global $insts, $deadinsts, $cinsts, $ainsts, $tini, $opts, $peersf, $cpeersf, $apeersf, $maxround;

    // Record the deepest round on the way DOWN. Assigning it after the
    // recursive call returned let the outermost call overwrite the value.
    if (!isset($maxround) || $id > $maxround) {
        $maxround = $id;
    }

    gecho('###### START OF ROUND ' . $id . ' ######' . N, true, false);

    // Hash-set views of the lists, for exact O(1) membership tests instead of
    // repeated loose in_array() scans over ever-growing arrays.
    $known     = array_fill_keys($ainsts, true);
    $checked   = array_fill_keys($cinsts, true);
    $responded = array_fill_keys($insts, true);
    $current   = array_fill_keys($list, true);
    $dead      = $opts['excludedead'] ? array_fill_keys($deadinsts, true) : [];
    $queued    = [];

    $nlist = [];
    $c = count($list);
    $i = 0;
    $rtini = time();

    foreach ($list as $inst) {
        if (!isset($known[$inst])) {
            $known[$inst] = true;
            $ainsts[] = $inst;
            fwrite($apeersf, $inst . N);
        }

        $i++;
        $now = time();
        $rtela = $now - $rtini;
        gecho('Working on «' . $inst . '»: round ' . $id . ', ' . $i . '/' . $c . '; TET: ' . ght($now - $tini, null, 0) . '; ETR of this round: ' . ght($rtela / $i * $c - $rtela, null, 0) . '; using ' . ghs(memory_get_usage(true)) . ' mem. (peak: ' . ghs(memory_get_peak_usage(true)) . '); ' . count($insts) . ' discovered instances; ' . count($nlist) . ' instances in next round list.' . N, true, false);

        waituntilonline();
        updexarr();

        gecho('Trying to load «' . $inst . '»’ s peers...' . N, true, false);
        $peers = gurl('https://' . $inst . '/api/v1/instance/peers', $opts['timeout'], $opts['curltimeout']);

        // Mark as checked whatever the outcome of the request was.
        $cinsts[] = $inst;
        $checked[$inst] = true;
        fwrite($cpeersf, $inst . N);

        if ($peers['cont'] === false) {
            gecho('Error loading «' . $inst . '»’ s peers: ' . $peers['emsg'] . '.' . N, true, true);
            continue;
        }

        $peers = @json_decode($peers['cont'], true);
        if (!is_array($peers)) {
            gecho('Error loading «' . $inst . '»’ s peers: got not good JSON.' . N, true, true);
            continue;
        }

        gecho('Successfully loaded «' . $inst . '»’ s peers :-)' . N, true, false);
        if (!isset($responded[$inst])) {
            gecho('Discovered instance «' . $inst . '» :-)' . N, true, false);
            $responded[$inst] = true;
            $insts[] = $inst;
            fwrite($peersf, $inst . N);
        }

        foreach ($peers as $peer) {
            // The peers list is remote, untrusted data: drop anything that is
            // not a string before it gets written to a file or validated.
            if (!is_string($peer)) {
                if ($opts['verbose']) gecho(' Not adding peer «' . json_encode($peer) . '» to next round list because its name is not a string.' . N, true, true);
                continue;
            }

            if (!isset($known[$peer])) {
                $known[$peer] = true;
                $ainsts[] = $peer;
                fwrite($apeersf, $peer . N);
            }

            // Collect every reason for rejecting this peer (reported in verbose mode).
            $whynot = [];
            if (isset($checked[$peer])) $whynot[] = 'it has already been checked';
            if (!validhostname($peer)) $whynot[] = 'its name is not a valid hostname';
            if (ckexarr($peer)) $whynot[] = 'its name matches an exclusion regexp';
            if (isset($current[$peer])) $whynot[] = 'it is already present in current list';
            if (isset($queued[$peer])) $whynot[] = 'it has already been added to next round list';
            if ($opts['excludedead'] && isset($dead[$peer])) $whynot[] = 'it’ s dead';

            if (count($whynot) > 0) {
                if ($opts['verbose']) gecho(' Not adding peer «' . $peer . '» to next round list because ' . implode(', ', $whynot) . '.' . N, true, true);
            } else {
                if ($opts['verbose']) gecho(' Adding peer «' . $peer . '» to next round list :-)' . N, true, false);
                $queued[$peer] = true;
                $nlist[] = $peer;
            }
        }
    }

    if (count($nlist) > 0) {
        // Release this round's working data before descending.
        unset($list, $known, $checked, $responded, $current, $dead, $queued);
        crawl($nlist, $id + 1);
    } else {
        gecho('Next round list is empty.' . N, true, false);
    }

    gecho('###### END OF ROUND ' . $id . ' ######' . N, true, false);
}
2020-10-13 08:21:26 +02:00
/**
 * Print a final message, clean up and terminate the script.
 *
 * Closes the global MySQL connection $link when it is set and not false,
 * removes the lockfile $lockfp when it exists, then writes $msg to standard
 * output if $code equals 0 and to STDERR otherwise.
 *
 * @param string $msg  Message to print before exiting.
 * @param int    $code Process exit code.
 */
function mexit($msg, $code) {
    global $link, $lockfp;

    if (isset($link) && $link !== false) {
        mysqli_close($link);
    }
    if (isset($lockfp) && file_exists($lockfp)) {
        unlink($lockfp);
    }

    if ($code != 0) {
        fwrite(STDERR, $msg);
    } else {
        echo $msg;
    }

    exit($code);
}
2022-07-13 12:45:57 +02:00
/**
 * Emit a log message.
 *
 * @param string $msg    Text to output (no newline is appended here).
 * @param bool   $prtime When true, the message is prefixed with microdate() and a space.
 * @param bool   $iserr  When true the message goes to STDERR, otherwise to standard output.
 */
function gecho($msg, $prtime, $iserr) {
    $line = $prtime ? microdate() . ' ' . $msg : $msg;

    if (!$iserr) {
        echo $line;
        return;
    }
    fwrite(STDERR, $line);
}
2022-12-18 18:42:11 +01:00
/**
 * Run a query on the given MySQL connection; on failure terminate the
 * script through mexit() with exit code 2.
 *
 * @param mixed  $link  MySQL connection (taken by reference).
 * @param string $query SQL text to execute.
 * @return mixed Whatever mysqli_query() returned (never false).
 */
function myq(&$link, $query) {
    try {
        $result = mysqli_query($link, $query);
    } catch (Exception $failure) {
        mexit('Error: query «' . $query . '» failed: ' . $failure->getMessage() . ' (' . $failure->getCode() . ').' . N, 2);
    }

    // for php versions < 8, which seem to not catch mysql exceptions
    if ($result === false) {
        mexit('Error: query «' . $query . '» failed: ' . mysqli_error($link) . ' (' . mysqli_errno($link) . ').' . N, 2);
    }

    return $result;
}
2022-07-13 12:45:57 +02:00
/**
 * Format a timestamp as "Y-m-d H:i:s.<fraction>".
 *
 * @param string|null $time A "0.fraction seconds" string as produced by
 *                          microtime(false); the current time when null.
 * @return string Date and time, a dot, then the fraction digits that follow "0.".
 */
function microdate($time = null) {
    if ($time === null) {
        $time = microtime(false);
    }
    $parts = explode(' ', $time);
    $fraction = substr($parts[0], 2); // drop the leading "0."
    $stamp = date('Y-m-d H:i:s', $parts[1]);

    return $stamp . '.' . $fraction;
}
2020-10-13 08:21:26 +02:00
/**
 * De-duplicate and sort a list in place, then write it to a file, one entry
 * per line.
 *
 * A warning is logged when duplicates were found, and an error is logged
 * (without aborting) when the file cannot be opened for writing.
 *
 * @param array  $arr     List to process; modified in place.
 * @param string $arrdesc Human-readable description used in log messages.
 * @param string $fp      Path of the destination file.
 */
function sortcheckandsave(&$arr, $arrdesc, &$fp) {
    $before = count($arr);
    $arr = array_unique($arr);
    if (count($arr) != $before) {
        gecho('Warning: ' . $arrdesc . ' contained duplicates, better check my code ;-)' . N, true, true);
    }

    gecho('Saving ordered ' . $arrdesc . ' into «' . $fp . '».' . N, true, false);
    sort($arr);

    $out = @fopen($fp, 'w');
    if ($out === false) {
        gecho('Error: couldn’ t open «' . $fp . '» for writing.' . N, true, true);
        return;
    }

    foreach ($arr as $entry) {
        fwrite($out, $entry . N);
    }
    fclose($out);
}
/**
 * Release the resources held by the script: close the three output files,
 * remove the lockfile and, when requested, rewrite the three lists sorted
 * and de-duplicated.
 *
 * @param bool $dosort When true, $insts, $cinsts and $ainsts are saved
 *                     through sortcheckandsave() into their respective files.
 */
function shutdown($dosort) {
    // $cpeersf and $cinsts have to be imported as well: without them the
    // checked-peers handle was never closed and sortcheckandsave() received
    // an undefined (null) list.
    global $opts, $peersf, $apeersf, $cpeersf, $insts, $ainsts, $cinsts, $lockfp;

    // is_resource() is false for handles that are unset, false or already
    // closed, so calling shutdown() a second time (e.g. from the signal
    // handler) is harmless.
    if (isset($peersf) && is_resource($peersf)) @fclose($peersf);
    if (isset($apeersf) && is_resource($apeersf)) @fclose($apeersf);
    if (isset($cpeersf) && is_resource($cpeersf)) @fclose($cpeersf);

    if (isset($lockfp) && file_exists($lockfp)) unlink($lockfp);

    if ($dosort) {
        sortcheckandsave($insts, 'list of responding instances', $opts['peersfp']);
        sortcheckandsave($cinsts, 'list of checked instances', $opts['cpeersfp']);
        sortcheckandsave($ainsts, 'list of all instances', $opts['apeersfp']);
    }
}
// WARNING: if the script is piped, e.g. "script.php | tee script.log",
// this function is still executed, even though its output is not shown.
/**
 * Handler installed for SIGTERM, SIGHUP and SIGINT: reports the interruption,
 * releases resources without sorting/saving the lists, and exits with code 3.
 *
 * @param int $signal Number of the received signal.
 */
function signalHandler($signal) {
    $notice = N . 'Interrupted (signal: ' . $signal . ').' . N;
    echo $notice;
    shutdown(false);
    exit(3);
}
/**
 * Tell whether a string is empty or made of whitespace only.
 *
 * @param string $val Text to test.
 * @return bool True when $val matches /^\s*$/.
 */
function isempty($val) {
    return preg_match('/^\s*$/', $val) === 1;
}
/**
 * Block until a TCP connection to www.google.com:80 can be opened
 * (1 second connect timeout), retrying every 30 seconds.
 *
 * A warning is logged on every failed attempt, and a notice once the
 * connection succeeds after at least one failure.
 */
function waituntilonline() {
    $url = 'www.google.com';
    $wasoffline = false;

    $probe = @fsockopen($url, 80, $errno, $errstr, 1);
    while ($probe === false) {
        $wasoffline = true;
        gecho('Warning: it seems we are offline, waiting 30 seconds before retrying :-(' . N, true, true);
        sleep(30);
        $probe = @fsockopen($url, 80, $errno, $errstr, 1);
    }
    fclose($probe);

    if ($wasoffline) {
        gecho('It seems we are back online! :-)' . N, true, false);
    }
}
/**
 * Reload the exclusion rules from the file named by $opts['excludefp'] into
 * the global $exarr (one regular expression per line, blank lines ignored).
 *
 * Does nothing when no exclusion file is configured. When the file cannot be
 * opened a warning is logged and the previously loaded rules are kept. Lines
 * that are not valid regular expressions are reported and skipped.
 */
function updexarr() {
    global $exarr, $opts;

    if (is_null($opts['excludefp'])) {
        return;
    }

    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        gecho('WARNING: I could not open «' . $opts['excludefp'] . '» for reading.' . N, true, true);
        return;
    }

    $exarr = [];
    $i = 0; // 1-based line number, used in warnings
    // Stop as soon as fgets() reports end of file, rather than feeding its
    // false return value through trim().
    while (($line = fgets($f)) !== false) {
        $i++;
        $line = trim($line);
        if (isempty($line)) {
            continue;
        }
        // preg_match() returns false only when the pattern itself is invalid.
        if (@preg_match($line, 'foo') !== false) {
            $exarr[] = $line;
        } else {
            gecho('WARNING: «' . $opts['excludefp'] . '», line ' . $i . ': «' . $line . '» is not a valid regular expression.' . N, true, true);
        }
    }

    // Close the handle explicitly: this function runs once per crawled instance.
    fclose($f);
}
/**
 * Check an instance name against the exclusion rules held in the global $exarr.
 *
 * @param string $inst Instance host name.
 * @return bool True as soon as one regular expression matches, false otherwise.
 */
function ckexarr($inst) {
    global $exarr;

    $excluded = false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            $excluded = true;
            break;
        }
    }

    return $excluded;
}
2022-11-11 21:57:30 +01:00
/**
 * Tell whether a string contains anything outside 7-bit ASCII, i.e. at least
 * one byte that belongs to a multibyte (or invalid) UTF-8 sequence.
 *
 * The former implementation compared strlen() with the number of /./u
 * matches; since "." does not match a newline, pure-ASCII strings containing
 * a line break were wrongly reported as multibyte.
 *
 * @param string $s Text to inspect.
 * @return bool True when $s holds at least one non-ASCII byte.
 */
function ismultibyte($s) {
    return preg_match('/[^\x00-\x7F]/', (string) $s) === 1;
}
2022-12-18 18:42:11 +01:00
/**
 * Validate a host name.
 *
 * Names containing non-ASCII bytes are first converted to their ASCII
 * (punycode) form with idn_to_ascii(). The resulting name must be at most
 * 253 bytes long and each dot-separated label must be 1-63 characters of
 * [a-zA-Z0-9-], neither starting nor ending with a hyphen.
 *
 * @param mixed $hostname Candidate host name; anything that is not a string is rejected.
 * @return bool True when the name is acceptable.
 */
function validhostname($hostname) {
    // Peer names come from remote JSON: never assume they are strings.
    if (!is_string($hostname)) {
        return false;
    }

    // Only names holding non-ASCII bytes need the IDN conversion.
    if (preg_match('/[^\x00-\x7F]/', $hostname) === 1) {
        $hostname = idn_to_ascii($hostname, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
        if ($hostname === false) {
            return false; // not convertible, hence not a valid name
        }
    }

    if (strlen($hostname) > 253) {
        return false;
    }

    foreach (explode('.', $hostname) as $label) {
        $len = strlen($label);
        if ($len < 1 || $len > 63) {
            return false;
        }
        // \z anchors at the very end of the subject; "$" would also match
        // just before a trailing newline and so let "host.tld\n" through.
        if (preg_match('/^[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\z/', $label) !== 1) {
            return false;
        }
    }

    return true;
}
//$url='www.team.starschlep.com/'; if (validhostname($url)) echo('OK: '.$url.N); else echo('KO: '.$url.N); die();
?>