#!/usr/bin/php
<?php
/*
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Take the locale from the LANG environment variable.
setlocale(LC_ALL, getenv('LANG'));

// Line terminator appended to every message and to every line written to the
// peers file. It must be a bare newline: any padding around it would end up
// inside the instance names stored in the peers file.
define('N', "\n");

// Default option values; the command-line parser overrides them.
$opts = array(
    'startinst' => 'mastodon.social', // first instance to crawl
    'peersfp'   => 'peers',           // path of the file the instance list is saved to
    'restore'   => false,             // whether to preload an existing peers file
    'excludefp' => null               // path of the exclusion-regex file, if any
);
// Usage text printed by «-h»/«--help». The defaults shown are read from $opts,
// so the keys used here must match the keys defined in $opts exactly.
$help = 'peerscrawl.php
    DESCRIPTION
        This program tries to build a fairly complete list of mastodon instances.
    SYNOPSIS
        peerscrawl.php [options]
    OPTIONS
        -s, --startinst <domain>
            Defines the first instance to crawl.
            DEFAULT: «' . $opts['startinst'] . '»
        -p, --peersfp <file>
            Defines the file into which the ordered list of instances will be saved.
            DEFAULT: «' . $opts['peersfp'] . '»
        -r, --restore
            If peers file already exists on program’s start it will be loaded into
            memory and each instance it contains will be considered “already
            crawled”, thus allowing to “restore an interrupted crawling session”.
        -e, --excludefp <file>
            Defines a file containing exclusion rules: one regular expression per
            line (empty lines are ignored). Any instance matching any defined regex
            will be ignored by the program. Changes made to this file during program
            execution will be taken into account.
        -h, --help
            Shows this help text and exits.
    This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
    This is free software, and you are welcome to redistribute it under
    certain conditions; see <http://www.gnu.org/licenses/> for details.' . N;
// Command-line parsing. Arguments that do not start with «-» are ignored.
// Options that take a value consume the following argument, so $i is advanced
// an extra step for them; flag options must NOT advance $i, otherwise the
// argument that follows them would be skipped.
for ($i = 1; $i < $argc; $i++) {
    if (substr($argv[$i], 0, 1) === '-') {
        switch ($argv[$i]) {
            case '-s':
            case '--startinst':
                if ($i + 1 >= $argc)
                    mexit('Option «' . $argv[$i] . '» has to be followed by a domain name (use «-h» for more info).' . N, 1);
                $i++;
                $opts['startinst'] = $argv[$i];
                break;
            case '-p':
            case '--peersfp':
                if ($i + 1 >= $argc)
                    mexit('Option «' . $argv[$i] . '» has to be followed by a file’ s path (use «-h» for more info).' . N, 1);
                $i++;
                $opts['peersfp'] = $argv[$i];
                break;
            case '-r':
            case '--restore':
                // Plain flag: it takes no value, so the index is left alone.
                $opts['restore'] = true;
                break;
            case '-e':
            case '--excludefp':
                // The exclusion file must already exist and be readable.
                if ($i + 1 >= $argc || !file_exists($argv[$i + 1]) || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1]))
                    mexit('Option «' . $argv[$i] . '» has to be followed by an existing, readable file’ s path (use «-h» for more info).' . N, 1);
                $i++;
                $opts['excludefp'] = $argv[$i];
                break;
            case '-h':
            case '--help':
                mexit($help, 0);
                break;
            default:
                mexit('Option «' . $argv[$i] . '» is unknown (use «-h» for more info).' . N, 1);
                break;
        }
    }
}
/**
 * Prints a message on standard output and terminates the script.
 *
 * @param string $msg  Text to print before exiting.
 * @param int    $code Value handed to exit().
 */
function mexit($msg, $code) {
    print $msg;
    exit($code);
}
/**
 * Closes the peers file and, on request, rewrites it as a sorted list.
 *
 * Works on the globals $opts, $peersf and $insts.
 *
 * @param bool $dosort When true, $insts is sorted and written, one instance per
 *                     line, to the file named by $opts['peersfp'], replacing
 *                     its previous content. When false the file is only closed.
 */
function shutdown($dosort) {
    global $opts, $peersf, $insts;

    // Release the handle used during crawling, if one was opened.
    if ($peersf) @fclose($peersf);

    if ($dosort) {
        echo('Saving ordered instances list into «' . $opts['peersfp'] . '».' . N);
        sort($insts);
        $peersf = @fopen($opts['peersfp'], 'w');
        if ($peersf !== false) {
            foreach ($insts as $inst)
                fwrite($peersf, $inst . N);
            fclose($peersf);
        } else {
            echo('Couldn’ t open «' . $opts['peersfp'] . '» for writing.' . N);
        }
    }
}
// Ticks let PHP dispatch pending signals to the handlers registered below.
declare(ticks = 1);

/**
 * Handles a termination signal: reports it, closes the peers file without
 * sorting it (shutdown(false)) and exits with status 2.
 *
 * @param int $signal Number of the signal that was received.
 */
function signalHandler($signal) {
    echo(N . 'I got interrupted (signal: ' . $signal . ').' . N);
    shutdown(false);
    exit(2);
}

pcntl_signal(SIGTERM, 'signalHandler'); // Termination ('kill' was called)
pcntl_signal(SIGHUP, 'signalHandler');  // Terminal log-out
pcntl_signal(SIGINT, 'signalHandler');  // Interrupted (Ctrl-C is pressed)
// Stream context handed to file_get_contents() when an instance is queried:
// a 3-second HTTP timeout, with TCP_NODELAY enabled on the socket.
$contextopts = array(
    'http' => array(
        'timeout' => 3
    ),
    'socket' => array(
        'tcp_nodelay' => true
    )
);
$context = stream_context_create($contextopts);

// Known instances, and the exclusion regexes currently in force.
$insts = array();
$exarr = array();

// With «--restore», preload the instances saved by a previous run.
if ($opts['restore']) {
    $loaded = false;
    if (is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        echo('Loading «' . $opts['peersfp'] . '».' . N);
        $loaded = file($opts['peersfp'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    }
    if ($loaded !== false) {
        $insts = $loaded;
    } else {
        echo('WARNING: I couldn’ t open «' . $opts['peersfp'] . '» for reading.' . N);
    }
}

// Opening with 'w' truncates the file, so fail early if it cannot be written
// instead of crawling with nowhere to save the results.
$peersf = @fopen($opts['peersfp'], 'w');
if ($peersf === false)
    mexit('Couldn’t open «' . $opts['peersfp'] . '» for writing.' . N, 1);

// Write the restored instances straight back: if this run is interrupted too,
// the file still holds everything gathered so far rather than only the
// instances discovered during this run.
foreach ($insts as $inst)
    fwrite($peersf, $inst . N);
/**
 * Tells whether a value is empty or made only of whitespace.
 *
 * @param string $val Value to test.
 * @return bool True when $val matches /^\s*$/, false otherwise.
 */
function isempty($val) {
    $blank = preg_match('/^\s*$/', $val);
    return $blank === 1;
}
/**
 * Blocks until a TCP connection to www.google.com on port 80 succeeds.
 *
 * Each failed attempt (1-second connect timeout) prints a timestamped warning
 * and is followed by a 5-second pause before the next try.
 */
function waituntilonline() {
    global $context; // declared here but not used in this function
    $host = 'www.google.com';
    for (;;) {
        $sock = @fsockopen($host, 80, $errno, $errstr, 1);
        if ($sock !== false) {
            break;
        }
        echo(strftime('%c') . ' - WARNING: it seems we are offline :-(' . N);
        sleep(5);
    }
    // The connection was only a reachability probe.
    fclose($sock);
}
/**
 * Reloads the exclusion regexes from the file named by $opts['excludefp'].
 *
 * Does nothing when no exclusion file is configured. When the file cannot be
 * opened a warning is printed and the global $exarr keeps its current content.
 * Otherwise $exarr is rebuilt from scratch: blank lines are skipped, lines
 * that are valid regular expressions are stored, and every other line is
 * reported together with its line number.
 */
function updexarr() {
    global $exarr, $opts;

    if (is_null($opts['excludefp']))
        return;

    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        echo('WARNING: I could not open «' . $opts['excludefp'] . '» for reading.' . N);
        return;
    }

    $exarr = array();
    $lineno = 0;
    // Stop on fgets() returning false rather than polling feof(), so that a
    // failed read is never passed on to trim().
    while (($raw = fgets($f)) !== false) {
        $lineno++;
        $line = trim($raw);
        if (isempty($line))
            continue;
        // A trial match against a dummy subject returns false only when the
        // pattern itself is invalid; «@» hides the warning PHP emits then.
        if (@preg_match($line, 'foo') !== false)
            $exarr[] = $line;
        else
            echo('WARNING: «' . $opts['excludefp'] . '», line ' . $lineno . ': «' . $line . '» is not a valid regular expression.' . N);
    }
    // This function runs on every crawl() call: release the handle right away.
    fclose($f);
}
/**
 * Checks an instance name against the exclusion regexes in the global $exarr.
 *
 * @param string $inst Instance name to check.
 * @return bool True as soon as one regex matches, false when none does.
 */
function ckexarr($inst) {
    global $exarr;
    $excluded = false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            $excluded = true;
            break;
        }
    }
    return ($excluded);
}
/**
 * Records an instance and recursively crawls the peers it reports.
 *
 * Before doing anything it waits for connectivity (waituntilonline()) and
 * reloads the exclusion rules (updexarr()). The instance is appended to the
 * global $insts and to the open peers file if it is not known yet; then
 * https://<inst>/api/v1/instance/peers is fetched and every string peer that
 * is neither excluded nor already known is crawled in turn.
 *
 * @param string $inst Domain name of the instance to crawl.
 */
function crawl($inst) {
    global $insts, $peersf, $context;

    waituntilonline();
    updexarr();

    if (!isempty($inst)) {
        // This check is redundant on a normal run, but it keeps the start
        // instance from ending up in the list twice when the program is
        // launched with "--restore".
        // Strict comparison: a loose one would treat numeric-looking names
        // (e.g. «1.10» and «1.1») as the same instance.
        if (!in_array($inst, $insts, true)) {
            echo('«' . $inst . '» is not a known instance, I add it to the list of known instances.' . N);
            $insts[] = $inst;
            // Only write when the peers file was actually opened.
            if (is_resource($peersf))
                fwrite($peersf, $inst . N);
        }

        echo('«' . $inst . '»: trying to load instance’ s peers ... ');
        $peers = @file_get_contents('https://' . $inst . '/api/v1/instance/peers', false, $context);

        if ($peers !== false && $peers !== '') {
            echo('OK :-)' . N);
            $peers = json_decode($peers, true);
            if (is_array($peers)) {
                foreach ($peers as $peer) {
                    if (!is_string($peer)) {
                        echo('>>> I won’ t crawl this peer because its name is not a string.' . N);
                        continue;
                    }
                    if (ckexarr($peer)) {
                        echo('>>> I won’ t crawl «' . $peer . '» because its name matches with an exclusion regex.' . N);
                        continue;
                    }
                    // Checked at this point, not up front: earlier recursive
                    // calls may already have added this peer.
                    if (!in_array($peer, $insts, true)) {
                        echo('>>> I will crawl «' . $peer . '».' . N);
                        crawl($peer);
                    }
                }
            }
        } else {
            echo('ERROR :-(' . N);
        }
    } else {
        echo('I WON’ T add nameless instances.' . N);
    }

    echo('~~~~~~~ Stats: ' . count($insts) . ' known istances ~~~~~~~' . N);
}
// Entry point: crawl from the configured start instance, then save the
// sorted instance list and exit successfully.
crawl($opts['startinst']);
echo('DONE CRAWLING! :-)' . N);
shutdown(true);
exit(0);
?>