This commit is contained in:
pezcurrel 2020-05-02 19:59:53 +02:00
parent 9a6642ad94
commit 463d2eaf72
8 changed files with 268 additions and 22195 deletions

3
.gitignore vendored
View file

@ -5,7 +5,8 @@ web/admin/crawler/instances.json
vendor
composer.lock
appunti.txt
web/admin/crawler/infojsonexample.txt
web/admin/crawler/peers
web/admin/crawler/zzz-materiali/
web/admin/zzz-estemp/
web/admin/zzz-materiali/mastostart_struttura_e_dati_pro_altervista.sql.gz
web/admin/zzz-oldcrawler/

View file

@ -58,10 +58,10 @@ if (function_exists('pcntl_signal')) {
$opts=array(
'timeout'=>3,
'log'=>true,
'log'=>false,
'jsonfp'=>__DIR__.'/instances.json',
'jsonwrite'=>true,
'jsonread'=>false
'jsonwrite'=>false,
'peersfp'=>'peers'
);
use function mysqli_real_escape_string as myesc;
@ -267,13 +267,13 @@ function blpgdumplinetomy($line) {
if (!$riprendi) {
$blacklistnew=array();
$insts=array();
lecho('Carico le istanze di partenza...'.N);
lecho('Carico le istanze di riferimento per le blacklist...'.N);
$res=mysqli_query($link,'SELECT Domain FROM StartNodes')
or mexit(__LINE__.': '.mysqli_error($link).N,3);
lecho(mysqli_num_rows($res).' istanze di partenza.'.N);
lecho(mysqli_num_rows($res).' istanze di riferimento.'.N);
while($row=mysqli_fetch_assoc($res)) {
$insts[]=$row['Domain'];
lecho('Recupero la lista delle istanze note a «'.$row['Domain'].'» ... ');
/*lecho('Recupero la lista delle istanze note a «'.$row['Domain'].'» ... ');
$buf=@file_get_contents('https://'.$row['Domain'].'/api/v1/instance/peers',false,$context);
if ($buf!==false) {
lecho('OK :-)'.N);
@ -286,7 +286,7 @@ if (!$riprendi) {
}
} else {
lecho('ERRORE :-('.N);
}
}*/
lecho('Recupero la blacklist di «'.$row['Domain'].'» ... ');
$buf=@file_get_contents('https://'.$row['Domain'].'/domain_blocks.txt',false,$context);
if ($buf!==false) {
@ -316,7 +316,18 @@ if (!$riprendi) {
}
}
//lecho('Carico le istanze note dal DB e aggiungo alla lista di quelle da controllare quelle che non ci sono già.'.N);
lecho('Carico le istanze di partenza da «'.$opts['peersfp'].'»...'.N);
$peers=@file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
if ($peers===false)
mexit('Non ho potuto aprire in lettura «'.$opts['peersfp'].'».'.N,1);
foreach ($peers as $pdom)
if (!in_array($pdom,$insts))
if (!willtrunc($pdom,'Instances','URI'))
$insts[]=$pdom;
else
lecho('Listanza «'.$pdom.'» non sarà considerata perché il suo dominio è troppo lungo per il campo «URI» della tabella «Instances» nel DB'.N);
lecho('Carico le istanze note dal DB e aggiungo alla lista di quelle da controllare quelle che non ci sono già.'.N);
$res=mysqli_query($link,'SELECT URI FROM Instances')
or mexit(__LINE__.': '.mysqli_error($link).N,3);
while($row=mysqli_fetch_assoc($res)) {
@ -707,7 +718,7 @@ while ($i<$cinsts) {
if ($opts['jsonwrite'])
fwrite($jsonf,'"'.$dom.'": '.json_encode($info,JSON_PRETTY_PRINT).','.N);
}
if ($ok && !is_null($info) && akeavinn('uri',$info) && !is_null(nempty($info['uri'])) && !willtrunc($info['uri'],'Instances','URI') && akeavinn('version',$info) && preg_match('/pleroma|pixelfed/i',$info['version'])===0) {
if ($ok && !is_null($info) && akeavinn('uri',$info) && !is_null(nempty($info['uri'])) && !willtrunc($info['uri'],'Instances','URI') && akeavinn('version',$info) && preg_match('/pleroma|pixelfed/i',$info['version'])!==1) {
$qok++;
$instrow=array('ID'=>null, 'FirstSeen'=>null, 'New'=>0, 'Good'=>0, 'Chosen'=>0, 'Visible'=>0, 'Blacklisted'=>0, 'URI'=>null, 'Title'=>null, 'ShortDesc'=>null, 'LongDesc'=>null, 'OurDesc'=>null, 'OurDescEN'=> null, 'LocalityID'=>null, 'OurLangsLock'=>0, 'Email'=>null, 'Software'=>null, 'Version'=>null, 'UserCount'=>null, 'StatusCount'=>null, 'DomainCount'=>null, 'ActiveUsersMonth'=>null, 'ActiveUsersHalfYear'=>null, 'Thumb'=>null, 'RegOpen'=>null, 'RegReqApproval'=>null, 'MaxTootChars'=>null, 'AdmAccount'=>null, 'AdmDisplayName'=>null, 'AdmCreatedAt'=>null, 'AdmNote'=>null, 'AdmURL'=>null, 'AdmAvatar'=>null, 'AdmHeader'=>null, 'GuestID'=>null, 'LastGuestEdit'=>null);
if (array_key_exists($info['uri'],$blacklist))
@ -854,7 +865,7 @@ while ($i<$cinsts) {
}
}
if ($instrow('OurLangsLock')==0) {
if ($instrow['OurLangsLock']==0) {
$instourlangs=langs($instrow['ID'], $instrow['URI'], true);
if (count($instourlangs)>0) {
mysqli_query($link,'DELETE FROM InstLangs WHERE InstID='.$instrow['ID'])

View file

@ -1,197 +0,0 @@
#!/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Locale for the whole process (affects the strftime() output of cdate()).
setlocale(LC_ALL, 'it_IT.UTF-8');
// Line terminator appended to every console message and every file record.
define('N', "\n");

// Stream context shared by all HTTP requests: give up after 3 seconds.
$contextopts = [
    'http' => ['timeout' => 3],
    'socket' => ['tcp_nodelay' => true],
];
$context = stream_context_create($contextopts);

// Seed instance the crawl starts from.
$startinst = 'mastodon.social';

// Input file holding one exclusion regex per line.
$exfp = 'crawlerone.exclude';

// Output files: every instance found, mastodon-only instances, software names.
$allfp = 'listaglobale.txt';
$okfp = 'listamastodon.txt';
$softfp = 'listasoft.txt';

// Output streams written incrementally while crawling (truncated on start).
$allf = @fopen($allfp, 'w');
$okf = @fopen($okfp, 'w');
$softf = @fopen($softfp, 'w');

// In-memory results: all instances, mastodon instances, software => count.
$insts = [];
$okinsts = [];
$softwares = [];
/**
 * Tells whether a value is empty or consists of whitespace only.
 *
 * @param string $val Value to inspect.
 * @return bool True when $val matches /^\s*$/, false otherwise.
 */
function isempty($val) {
    $blank = preg_match('/^\s*$/', $val);
    return $blank === 1;
}
/**
 * Builds the timestamp used as a prefix for console messages.
 *
 * @return string Current date and time formatted as '%a %d %b %Y, %T'
 *                (weekday and month names follow the active locale).
 */
function cdate() {
    $format = '%a %d %b %Y, %T';
    $stamp = strftime($format);
    return $stamp;
}
/**
 * Blocks until an HTTP request to http://www.google.com succeeds.
 *
 * While the request fails, a timestamped "offline" message is printed and the
 * probe is retried every 5 seconds; a timestamped "online" message is printed
 * once it succeeds. Uses the global stream context $context.
 *
 * @return void
 */
function waituntilonline() {
    global $context;
    $probe = 'http://www.google.com';
    for (;;) {
        $reply = @file_get_contents($probe, false, $context);
        if ($reply !== false) {
            break;
        }
        echo cdate() . ' - Pare che siamo offline...' . N;
        sleep(5);
    }
    echo cdate() . ' - Pare che siamo online! :-)' . N;
}
/**
 * Reloads the exclusion rules from the file named by the global $exfp into
 * the global $exarr (one regular expression per line, empty lines skipped).
 *
 * When the file is missing or cannot be read, $exarr becomes an empty array
 * instead of false, so that ckexarr() can always iterate over it.
 *
 * @return void
 */
function updexarr() {
    global $exarr, $exfp;
    $rules = false;
    // Check readability first: file() would emit a warning and return false.
    if (is_string($exfp) && is_file($exfp) && is_readable($exfp)) {
        $rules = file($exfp, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    }
    $exarr = ($rules === false) ? array() : $rules;
}
/**
 * Checks an instance name against the exclusion rules in the global $exarr.
 *
 * @param string $inst Instance domain to test.
 * @return bool True as soon as one regular expression of $exarr matches
 *              $inst, false when none does.
 */
function ckexarr($inst) {
    global $exarr;
    $excluded = false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            $excluded = true;
            break;
        }
    }
    return $excluded;
}
/**
 * Recursively crawls the instance $inst and every peer it advertises.
 *
 * For a non-empty $inst the function:
 *  1. appends it to the global $insts list and to the $allf stream;
 *  2. fetches https://<inst>/nodeinfo/2.0 and, when the reported software
 *     name matches /mastodon/i, appends the instance to $okinsts and $okf;
 *     the lower-cased software name is counted in $softwares (and written
 *     to $softf the first time it is seen);
 *  3. fetches https://<inst>/api/v1/instance/peers and calls itself on every
 *     peer that is a string, is not excluded by ckexarr() and is not already
 *     in $insts.
 * Before doing anything it waits for connectivity, reloads the exclusion
 * rules and prints the per-software counters; at the end it prints global
 * stats. Progress messages are echoed throughout.
 *
 * Note that the function itself does not check whether $inst is already
 * known: the caller (the recursive call site below) performs that check.
 *
 * @param string $inst Domain of the instance to crawl.
 * @return void
 */
function crawl($inst) {
global $insts, $okinsts, $softwares, $allf, $okf, $softf, $context;
waituntilonline();
// Reload the exclusion rules on every call so that edits made to the file
// while the program runs are picked up.
updexarr();
foreach ($softwares as $key=>$val)
echo('Software «'.$key.'»: '.$val.' istanze.'.N);
if (!isempty($inst)) {
// The checks on whether $inst is already among the discovered/crawled instances, and on whether it is excluded, are done right before the recursive call to crawl, below.
echo('«'.$inst.'» non è presente nella lista delle istanze scovate, la aggiungo.'.N);
$insts[]=$inst;
fwrite($allf,$inst.N);
echo('«'.$inst.'»: provo a recuperare info da Nodeinfo ... ');
$buf=@file_get_contents('https://'.$inst.'/nodeinfo/2.0',false,$context);
if ($buf!=false) {
echo('OK :-)'.N);
echo('«'.$inst.'»: Nodeinfo: controllo che il software sia mastodon ... ');
$buf=json_decode($buf,true);
if (is_array($buf) && array_key_exists('software',$buf) && array_key_exists('name',$buf['software'])) {
if (preg_match('/mastodon/i',$buf['software']['name'])===1) {
echo('SI! :-)'.N);
echo('«'.$inst.'»: il software è mastodon, aggiungo listanza alla lista delle istanze OK! :-)'.N);
$okinsts[]=$inst;
fwrite($okf,$inst.N);
} else {
echo('NO :-('.N);
echo('«'.$inst.'»: il software non è mastodon, NON aggiungo listanza alla lista delle istanze ok :-('.N);
}
// Count how many instances run each software; a name seen for the first
// time is also appended to the software list file.
$software=strtolower($buf['software']['name']);
if (!isempty($software)) {
if (!array_key_exists($software,$softwares)) {
echo('Ho trovato un software che non mi è ancora noto: «'.$software.'»!'.N);
$softwares[$software]=1;
fwrite($softf,$software.N);
} else {
$softwares[$software]++;
}
}
} else {
echo('ERRORE! :-('.N);
}
} else {
echo('ERRORE :-('.N);
echo('«'.$inst.'»: Nodeinfo non ha risposto, NON aggiungo listanza alla lista delle istanze ok :-('.N);
}
echo('«'.$inst.'»: provo a recuperare la lista delle istanze conosciute allistanza ... ');
$peers=@file_get_contents('https://'.$inst.'/api/v1/instance/peers',false,$context);
if ($peers!=false) {
echo('OK :-)'.N);
$peers=json_decode($peers,true);
if (is_array($peers)) {
foreach ($peers as $peer) {
if (@is_string($peer)) {
if (!ckexarr($peer)) {
if (!in_array($peer,$insts)) {
echo('>>> Crawlo «'.$peer.'».'.N);
// Depth-first recursion: the peer's own peers are crawled before the
// remaining peers of the current instance.
crawl($peer);
} else {
echo('>>> NON crawlo «'.$peer.'» perché lho già fatto.'.N);
}
} else {
echo('>>> NON crawlo «'.$peer.'» perché il suo nome corrisponde a unesclusione.'.N);
}
} else {
echo('>>> NON crawlo «'.$peer.'» perché il suo nome non è una stringa.'.N);
}
}
}
} else {
echo('ERRORE :-('.N);
}
} else {
echo('NON aggiungo istanze senza nome.'.N);
}
echo('~~~~~~~ Stats: '.count($insts).' istanze note, '.count($okinsts).' istanze mastodon vive, '.count($softwares).' software trovati. ~~~~~~~'.N);
}
// Run the whole crawl starting from the seed instance.
crawl($startinst);
echo('FINE CRAWLING! :-)'.N);

// Close the streams used during the crawl before the files are rewritten.
// fclose() needs the stream handles ($allf, $okf, $softf): the previous code
// passed the path strings ($allfp, ...), so nothing was ever closed.
foreach (array($allf, $okf, $softf) as $handle) {
    if (is_resource($handle)) {
        fclose($handle);
    }
}

echo('Salvo i risultati (tutte le istanze, istanze ok, softwares) ordinati nei rispettivi file.'.N);
sort($insts);
sort($okinsts);
// Most widespread software first.
arsort($softwares, SORT_NUMERIC);

// Rewrite the list of all discovered instances, sorted.
$f = @fopen($allfp, 'w');
if ($f !== false) {
    foreach ($insts as $inst) {
        fwrite($f, $inst.N);
    }
    fclose($f);
} else {
    echo('Non ho potuto aprire in scrittura il file «'.$allfp.'».'.N);
}

// Rewrite the list of mastodon instances, sorted.
$f = @fopen($okfp, 'w');
if ($f !== false) {
    foreach ($okinsts as $inst) {
        fwrite($f, $inst.N);
    }
    fclose($f);
} else {
    echo('Non ho potuto aprire in scrittura il file «'.$okfp.'».'.N);
}

// Rewrite the software list as "<name> <instance count>" lines.
$f = @fopen($softfp, 'w');
if ($f !== false) {
    foreach ($softwares as $software => $num) {
        fwrite($f, $software.' '.$num.N);
    }
    fclose($f);
} else {
    echo('Non ho potuto aprire in scrittura il file «'.$softfp.'».'.N);
}
exit(0);
?>

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,22 +0,0 @@
bacn 1
corgidon 1
diaspora 26
dolphin 29
epicyon 2
fedimoe 1
friendica 167
groundpolis 3
hackhackhack 1
hubzilla 87
imaginarium 1
mastodon 2253
misskey 97
osada 1
plume 45
prismo 4
pub-relay 1
red 1
reel2bits 4
selective-relay 2
tavern 1
zap 9

245
web/admin/crawler/peerscrawl.php Executable file
View file

@ -0,0 +1,245 @@
#!/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Use the locale named by the LANG environment variable.
setlocale(LC_ALL, getenv('LANG'));
// Line terminator appended to every console message and every file record.
define('N', "\n");

// Default settings; each one can be overridden from the command line.
$opts = [
    // first instance to crawl
    'startinst' => 'mastodon.social',
    // file the ordered list of instances is saved into
    'peersfp' => 'peers',
    // whether an existing peers file is loaded as "already crawled"
    'restore' => false,
    // optional file of exclusion regexes (null: no exclusions)
    'excludefp' => null,
];

// Text printed by -h/--help; it embeds the defaults defined above.
$help = 'peerscrawl.php
DESCRIPTION
This program tries to build a fairly complete list of mastodon instances.
SYNOPSIS
peerscrawl.php [options]
OPTIONS
-s, --startinst <domain>
Defines the first instance to crawl.
DEFAULT: «'.$opts['startinst'].'»
-p, --peersfp <file>
Defines the file into which the ordered list of instances will be saved.
DEFAULT: «'.$opts['peersfp'].'»
-r, --restore
If peers file already exists on programs start it will be loaded into
memory and each instance it contains will be considered “already
crawled”, thus allowing to “restore an interrupted crawling session”.
-e, --excludefp <file>
Defines a file containing exclusion rules: one regular expression per
line (empty lines are ignored). Any instance matching any defined regex
will be ignored by the program. Changes made to this file during program
execution will be taken into account.
This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.'.N;
// Command-line parsing. Arguments starting with «-» are options; an option
// that takes a value consumes the following argument. Arguments that do not
// start with «-» are ignored.
for ($i = 1; $i < $argc; $i++) {
    if (substr($argv[$i], 0, 1) == '-') {
        switch ($argv[$i]) {
            case '-s':
            case '--startinst':
                if ($i + 1 >= $argc)
                    mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N, 1);
                $i++;
                $opts['startinst'] = $argv[$i];
                break;
            case '-p':
            case '--peersfp':
                if ($i + 1 >= $argc)
                    mexit('Option «'.$argv[$i].'» has to be followed by a files path (use «-h» for more info).'.N, 1);
                $i++;
                $opts['peersfp'] = $argv[$i];
                break;
            case '-r':
            case '--restore':
                // Plain flag without a value: $i must NOT be advanced here,
                // otherwise the argument that follows (e.g. «-e file») would
                // be skipped silently.
                $opts['restore'] = true;
                break;
            case '-e':
            case '--excludefp':
                // The exclusion file must already exist and be readable.
                if ($i + 1 >= $argc || !file_exists($argv[$i + 1]) || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1]))
                    mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable files path (use «-h» for more info).'.N, 1);
                $i++;
                $opts['excludefp'] = $argv[$i];
                break;
            case '-h':
            case '--help':
                mexit($help, 0);
                break;
            default:
                mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N, 1);
                break;
        }
    }
}
/**
 * Prints a message and terminates the program.
 *
 * @param string $msg  Text written to standard output as is.
 * @param int    $code Process exit status.
 * @return void This function does not return.
 */
function mexit($msg,$code) {
    print $msg;
    exit($code);
}
/**
 * Saves the sorted list of known instances into the peers file.
 *
 * Closes the global stream $peersf when it is set, sorts the global $insts
 * in place, then reopens the file named by $opts['peersfp'] for writing and
 * stores one instance per line. A message is printed when the file cannot
 * be opened.
 *
 * @return void
 */
function shutdown() {
    global $opts, $peersf, $insts;
    $path = $opts['peersfp'];
    if ($peersf) {
        @fclose($peersf);
    }
    echo 'Saving ordered instances list into «' . $path . '».' . N;
    sort($insts);
    $peersf = @fopen($path, 'w');
    if ($peersf === false) {
        echo 'Couldnt open «' . $path . '» for writing.' . N;
        return;
    }
    foreach ($insts as $domain) {
        fwrite($peersf, $domain . N);
    }
    fclose($peersf);
}
// Ticks let PHP dispatch pending signals to the handlers registered below.
declare(ticks=1);

/**
 * Handles a termination signal: saves the instances found so far through
 * shutdown() and exits with status 2.
 *
 * @param int $signal Number of the signal received.
 * @return void This function does not return.
 */
function signalHandler($signal) {
    echo(N.'I got interrupted (signal: '.$signal.').'.N);
    shutdown();
    exit(2);
}

// The PCNTL extension is optional: calling pcntl_signal() when it is missing
// would abort the program with an "undefined function" fatal error.
if (function_exists('pcntl_signal')) {
    pcntl_signal(SIGTERM, 'signalHandler'); // Termination ('kill' was called)
    pcntl_signal(SIGHUP, 'signalHandler');  // Terminal log-out
    pcntl_signal(SIGINT, 'signalHandler');  // Interrupted (Ctrl-C is pressed)
} else {
    echo('WARNING: PCNTL extension is not available: the ordered instances list will not be saved if the program gets interrupted.'.N);
}
// Stream context shared by all HTTP requests: give up after 3 seconds.
$contextopts = array(
    'http' => array(
        'timeout' => 3
    ),
    'socket' => array(
        'tcp_nodelay' => true
    )
);
$context = stream_context_create($contextopts);

// Known instances and exclusion regexes.
$insts = array();
$exarr = array();

// Restore mode: instances listed in an existing peers file are loaded and
// will be considered already known by crawl().
if ($opts['restore']) {
    if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
        echo('Loading «'.$opts['peersfp'].'».'.N);
        $loaded = file($opts['peersfp'], FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
        if ($loaded !== false)
            $insts = $loaded;
        else
            echo('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N);
    } else {
        echo('WARNING: I couldnt open «'.$opts['peersfp'].'» for reading.'.N);
    }
}

// Stream that crawl() appends newly found instances to. Fail right away when
// it cannot be opened, instead of crawling without being able to save.
$peersf = @fopen($opts['peersfp'], 'w');
if ($peersf === false)
    mexit('Couldnt open «'.$opts['peersfp'].'» for writing.'.N, 1);

// Mode 'w' has just truncated the file: write the restored instances back
// immediately, so that they are not lost if the program dies before
// shutdown() gets the chance to rewrite the whole list.
foreach ($insts as $inst)
    fwrite($peersf, $inst.N);
/**
 * Tells whether a value is empty or consists of whitespace only.
 *
 * @param string $val Value to inspect.
 * @return bool True when $val matches /^\s*$/, false otherwise.
 */
function isempty($val) {
    return preg_match('/^\s*$/', $val) === 1;
}
/**
 * Blocks until a TCP connection to www.google.com on port 80 succeeds.
 *
 * Each attempt has a 1 second timeout; while attempts fail, a timestamped
 * warning is printed and the probe is retried every 5 seconds. The probe
 * connection is closed as soon as it is established.
 *
 * @return void
 */
function waituntilonline() {
    global $context;
    $host = 'www.google.com';
    $sock = @fsockopen($host, 80, $errno, $errstr, 1);
    while ($sock === false) {
        echo strftime('%c') . ' - WARNING: it seems we are offline :-(' . N;
        sleep(5);
        $sock = @fsockopen($host, 80, $errno, $errstr, 1);
    }
    fclose($sock);
}
/**
 * Reloads the exclusion rules from the file named by $opts['excludefp'] into
 * the global $exarr.
 *
 * Every non-blank line is trimmed and kept when it is a valid regular
 * expression; invalid ones are reported with their line number and skipped.
 * When no exclusion file is configured, or when it cannot be opened, $exarr
 * is left untouched.
 *
 * @return void
 */
function updexarr() {
    global $exarr, $opts;
    if (is_null($opts['excludefp'])) {
        return;
    }
    $f = @fopen($opts['excludefp'], 'r');
    if ($f === false) {
        echo('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N);
        return;
    }
    $rules = array();
    $lineno = 0;
    // Loop on the result of fgets() rather than on feof(): at end of file
    // fgets() returns false, which must not be processed as a line.
    while (($line = fgets($f)) !== false) {
        $lineno++;
        $line = trim($line);
        if (isempty($line)) {
            continue;
        }
        // A trial match is the way to validate a regex: preg_match() returns
        // false (and emits a warning, silenced here) on an invalid pattern.
        if (@preg_match($line, 'foo') !== false)
            $rules[] = $line;
        else
            echo('WARNING: «'.$opts['excludefp'].'», line '.$lineno.': «'.$line.'» is not a valid regular expression.'.N);
    }
    // This function runs once per crawled instance: release the handle
    // explicitly instead of relying on garbage collection.
    fclose($f);
    $exarr = $rules;
}
/**
 * Checks an instance name against the exclusion rules in the global $exarr.
 *
 * @param string $inst Instance domain to test.
 * @return bool True as soon as one regular expression of $exarr matches
 *              $inst, false when none does.
 */
function ckexarr($inst) {
    global $exarr;
    $matched = false;
    foreach ($exarr as $pattern) {
        if (preg_match($pattern, $inst) === 1) {
            $matched = true;
            break;
        }
    }
    return $matched;
}
/**
 * Crawls $inst and, transitively, every instance reachable through the
 * «/api/v1/instance/peers» endpoint, collecting their domains.
 *
 * Every visited instance that is not known yet is appended to the global
 * $insts list and to the $peersf stream (when it is open). Peers that are
 * not strings, that match an exclusion regex or that are already known are
 * not visited. $inst itself is always visited, even when already known, so
 * that a restored session can pick up from its peers.
 *
 * The traversal uses an explicit worklist instead of recursion: a recursive
 * crawl keeps the decoded peers list of every ancestor alive, so its memory
 * use grows with the depth of the crawl. Known instances are tracked in a
 * hash index, which gives constant-time, exact (non type-juggling) lookups
 * in place of in_array() scans over the whole list.
 *
 * @param string $inst Domain of the instance the crawl starts from.
 * @return void
 */
function crawl($inst) {
    global $insts, $peersf, $context;
    // Hash index of the instances already in $insts (e.g. restored ones).
    $known = array();
    foreach ($insts as $domain)
        $known[$domain] = true;
    // Instances waiting to be visited, and the set of those ever queued,
    // so that each peer is queued at most once.
    $pending = array($inst);
    $queued = array();
    $isstart = true;
    while (count($pending) > 0) {
        $cur = array_pop($pending);
        waituntilonline();
        // Reload the exclusion rules: the file may change while we run.
        updexarr();
        if (isempty($cur)) {
            echo('I WONT add nameless instances.'.N);
        } elseif (!$isstart && ckexarr($cur)) {
            // The peer was queued before an exclusion rule matching it was
            // added to the exclusion file.
            echo('>>> I wont crawl «'.$cur.'» because its name matches with an exclusion regex.'.N);
        } else {
            if (!isset($known[$cur])) {
                echo('«'.$cur.'» is not a known instance, I add it to the list of known instances.'.N);
                $insts[] = $cur;
                $known[$cur] = true;
                // $peersf is false when the peers file could not be opened.
                if ($peersf)
                    fwrite($peersf, $cur.N);
            }
            echo('«'.$cur.'»: trying to load instances peers ... ');
            $body = @file_get_contents('https://'.$cur.'/api/v1/instance/peers', false, $context);
            if ($body === false || $body === '') {
                echo('ERROR :-('.N);
            } else {
                echo('OK :-)'.N);
                $peers = json_decode($body, true);
                if (is_array($peers)) {
                    foreach ($peers as $peer) {
                        if (!is_string($peer)) {
                            echo('>>> I wont crawl this peer because its name is not a string.'.N);
                            continue;
                        }
                        if (ckexarr($peer)) {
                            echo('>>> I wont crawl «'.$peer.'» because its name matches with an exclusion regex.'.N);
                            continue;
                        }
                        if (!isset($known[$peer]) && !isset($queued[$peer])) {
                            echo('>>> I will crawl «'.$peer.'».'.N);
                            $queued[$peer] = true;
                            $pending[] = $peer;
                        }
                    }
                }
            }
        }
        $isstart = false;
        echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
    }
}
// Entry point: crawl from the configured start instance, then save the
// ordered instances list and terminate successfully.
crawl($opts['startinst']);
echo 'DONE CRAWLING! :-)' . N;
shutdown();
exit(0);
?>