1
0
Fork 0
MastodonStartpage/web/admin/crawler/crawler.php

459 lines
17 KiB
PHP
Raw Normal View History

2019-12-01 09:07:45 +01:00
#!/bin/php
<?php
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Shorthand for the newline appended to console and log messages.
define('N',"\n");
// Tick-based dispatch lets the pcntl handlers below run between statements.
declare(ticks=1);
// All three signals share one handler (signalHandler) that closes open resources and exits.
pcntl_signal(SIGTERM,'signalHandler');// Termination ('kill' was called)
pcntl_signal(SIGHUP,'signalHandler');// Terminal log-out
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
/**
 * Handler registered for SIGTERM, SIGHUP and SIGINT.
 *
 * Announces the interruption, closes whichever of the global MySQL link,
 * JSON dump file and log file are currently set, then exits with status 2.
 *
 * @param int $signal Number of the received signal (not used).
 */
function signalHandler($signal) {
    global $link, $logf, $jsonf;

    lecho(N.'Sono stato interrotto.'.N);

    if ($link) {
        lecho('La connessione MySQL è aperta, la chiudo.'.N);
        mysqli_close($link);
    }

    if ($jsonf) {
        echo 'Il file di dump json è aperto, lo chiudo.'.N;
        // Write the closing sentinel entry and brace so the dump ends as a complete JSON object.
        $trailer='"Fine?": true'.N.'}'.N;
        fwrite($jsonf,$trailer);
        fclose($jsonf);
    }

    if ($logf) {
        // Plain echo: the log handle is about to be closed.
        echo 'Il file di log è aperto, lo chiudo.'.N;
        fclose($logf);
    }

    exit(2);
}
// Runtime options of the crawler.
$opts=array(
    // Timeout (seconds) handed to the HTTP stream context used for every fetch.
    'timeout'=>3,
    // When true, lecho() also writes its messages to the log file.
    'log'=>true,
    // File name of the JSON dump; it is opened relative to this script's directory.
    'jsonfp'=>'instances.json',
    // When true, the info fetched for every instance is appended to the JSON dump.
    'jsonwrite'=>true,
    // Not read anywhere in the visible code.
    'jsonread'=>false
);
2019-12-01 09:07:45 +01:00
2019-12-26 21:57:36 +01:00
use function mysqli_real_escape_string as myesc;
2019-12-01 09:07:45 +01:00
/**
 * Converts a duration made of digits plus an optional one-letter unit into seconds.
 *
 * Units and their multipliers: none or "s" = 1, "m" = 60, "o" = 3600,
 * "g" = 86400 (one day), "S" = 7 days, "M" = 30 days, "A" = 365 days.
 *
 * The whole string must match: input with trailing characters that are not a
 * known unit (e.g. "5x") is rejected instead of being read as plain seconds.
 *
 * @param string $str Duration, e.g. "90", "15m", "2g".
 * @return int|false  Number of seconds, or false when $str is not a valid duration.
 */
function tosec($str) {
    static $multipliers=array(
        ''=>1,
        's'=>1,
        'm'=>60,
        'o'=>3600,
        'g'=>86400,
        'S'=>604800,
        'M'=>2592000,
        'A'=>31536000
    );
    if (preg_match('/^([0-9]+)([smogSMA]?)$/',$str,$buf)!==1)
        return(false);
    // Cast so every unit, including bare seconds, yields an integer.
    return((int)$buf[1]*$multipliers[$buf[2]]);
}
2019-12-01 09:07:45 +01:00
2019-12-26 21:57:36 +01:00
/**
 * Emits a final message, releases the global resources and terminates the script.
 *
 * @param string $msg     Message passed to lecho() before exiting.
 * @param int    $code    Process exit status.
 * @param bool   $closemy When true, the global MySQL link is closed first.
 */
function mexit($msg,$code,$closemy=false) {
    // $logf must be imported too, otherwise the log handle is never seen (and never closed) here.
    global $link, $logf;
    lecho($msg);
    if ($closemy && $link)
        mysqli_close($link);
    if (is_resource($logf))
        fclose($logf);
    exit($code);
}
2019-12-26 21:57:36 +01:00
/**
 * Prints a message and, when logging is enabled, appends it to the log file.
 *
 * @param string $msg     Text to emit (no newline is added).
 * @param bool   $logonly When true, the message is not echoed, only logged.
 */
function lecho($msg,$logonly=false) {
    global $opts, $logf;
    if (!$logonly)
        echo($msg);
    // The handle may be missing (log file not opened yet, or the open failed and
    // mexit() is reporting that very failure): skip the write instead of failing.
    if (!empty($opts['log']) && is_resource($logf))
        fwrite($logf,$msg);
}
2019-12-26 21:57:36 +01:00
// Open (and truncate) the log file next to this script when logging is enabled.
$logfp='crawler.log';
if ($opts['log']) {
    $logf=@fopen(__DIR__.'/'.$logfp,'w')
        or mexit('Non ho potuto aprire in scrittura il file di log «'.$logfp.'».',1);
}

// Database credentials are read from an ini file.
// NOTE(review): unlike the log file, this path is not anchored to __DIR__, so it
// depends on the current working directory — confirm that is intended.
$inifp='../sec/mastostartadmin.ini';
$iniarr=parse_ini_file($inifp)
    or mexit('Impossibile aprire il file di configurazione «'.$inifp.'»'.N,1);

// On a failed connect $link is false, so the error text must come from
// mysqli_connect_error() rather than mysqli_error($link).
$link=mysqli_connect($iniarr['db_host'],$iniarr['db_admin_name'],$iniarr['db_admin_password'],$iniarr['db_name'],$iniarr['db_port'],$iniarr['db_socket'])
    or mexit(mysqli_connect_error().N,1);
mysqli_set_charset($link,'utf8');

// Stream context shared by every HTTP request made by the crawler.
$contextopts=array(
    'http'=>array(
        'timeout'=>$opts['timeout']
    ),
    'socket'=>array(
        'tcp_nodelay'=>true
    )
);
$context=stream_context_create($contextopts);

// Load the stored blacklist, indexed by domain.
$blacklist=array();
lecho('Carico la blacklist dal database...'.N);
$res=mysqli_query($link,'SELECT * FROM Blacklist')
    or mexit(mysqli_error($link).N,3,true);
lecho(mysqli_num_rows($res).' istanze nella blacklist.'.N);
while($row=mysqli_fetch_assoc($res)) {
    $blacklist[$row['Domain']]=$row;
}
2019-12-26 21:57:36 +01:00
/**
 * Parses a "YYYY-MM-DD hh:mm:ss[.fraction][Z]" timestamp (date and time separated
 * by a space or "T") into a Unix timestamp with fractional seconds.
 *
 * The fractional part is optional. A trailing "Z" marks the value as UTC and is
 * converted with gmmktime(); without it the value is interpreted in the script's
 * default timezone via mktime().
 *
 * NOTE(review): despite the name, the result is a Unix timestamp, not a MySQL
 * DATETIME string — confirm the column types it is stored into.
 *
 * @param mixed $pgdate Timestamp text.
 * @return float|false  Seconds since the Unix epoch, or false when $pgdate does not match.
 */
function pgdatetomy($pgdate) {
    if (!is_string($pgdate)
        || preg_match('/^(\d+)-(\d+)-(\d+)[ T](\d+):(\d+):(\d+)(?:\.(\d+))?(Z?)$/',$pgdate,$buf)!==1)
        return(false);
    $fraction=($buf[7]!=='') ? floatval('0.'.$buf[7]) : 0.0;
    $args=array((int)$buf[4],(int)$buf[5],(int)$buf[6],(int)$buf[2],(int)$buf[3],(int)$buf[1]);
    if ($buf[8]==='Z')
        $seconds=gmmktime($args[0],$args[1],$args[2],$args[3],$args[4],$args[5]);
    else
        $seconds=mktime($args[0],$args[1],$args[2],$args[3],$args[4],$args[5]);
    return($seconds+$fraction);
}
2019-12-26 21:57:36 +01:00
/**
 * Turns one tab-separated line of a domain-block dump into a Blacklist row.
 *
 * Column order expected in $line: domain, created at, modified at, severity,
 * reject-media flag, reject-reports flag, public comment. The two flags use
 * "t"/"f"; anything else (including a missing column) is stored as 0.
 *
 * @param string $line One data line (comment and blank lines are filtered by the caller).
 * @return array Row keyed by Domain, CreatedAt, ModifiedAt, Severity, RejectMedia,
 *               RejectReports, PublicComment; the two dates are pgdatetomy() results.
 */
function blpgdumplinetomy($line) {
    $truefalse=array('f'=>0,'t'=>1);
    // The caller splits the file on "\n", so a CRLF dump leaves a trailing "\r": strip it.
    // Pad to seven columns so short or malformed lines never hit an undefined offset.
    $cols=array_pad(explode("\t",rtrim($line,"\r\n")),7,'');
    return(array(
        'Domain'=>$cols[0],
        'CreatedAt'=>pgdatetomy($cols[1]),
        'ModifiedAt'=>pgdatetomy($cols[2]),
        'Severity'=>$cols[3],
        'RejectMedia'=>isset($truefalse[$cols[4]]) ? $truefalse[$cols[4]] : 0,
        'RejectReports'=>isset($truefalse[$cols[5]]) ? $truefalse[$cols[5]] : 0,
        'PublicComment'=>$cols[6]
    ));
}
2019-12-26 21:57:36 +01:00
// $insts: set of domains to crawl (domain => null).
// $blacklistnew: blacklist rows learned in this run that are not yet in the DB.
$blacklistnew=array();
$insts=array();
lecho('Carico le istanze di partenza...'.N);
$res=mysqli_query($link,'SELECT Domain FROM StartNodes')
    or mexit(mysqli_error($link).N,3,true);
lecho(mysqli_num_rows($res).' istanze di partenza.'.N);
while($row=mysqli_fetch_assoc($res)) {
    $insts[$row['Domain']]=null;

    // 1) Peers known to the start node.
    lecho('Recupero la lista delle istanze note a «'.$row['Domain'].'» ... ');
    $buf=@file_get_contents('https://'.$row['Domain'].'/api/v1/instance/peers',false,$context);
    // A reply that is not a JSON array is treated like a failed request
    // instead of being iterated as if it were a list.
    $peers=($buf!==false) ? json_decode($buf,true) : null;
    if (is_array($peers)) {
        lecho('OK :-)'.N);
        foreach ($peers as $pdom) {
            // Remote data: accept only strings of at most 64 bytes that are not queued yet.
            if (is_string($pdom) && strlen($pdom)<=64 && !array_key_exists($pdom,$insts))
                $insts[$pdom]=null;
        }
    } else {
        lecho('ERRORE :-('.N);
    }

    // 2) Domain blocks published by the start node.
    lecho('Recupero la blacklist di «'.$row['Domain'].'» ... ');
    $buf=@file_get_contents('https://'.$row['Domain'].'/domain_blocks.txt',false,$context);
    if ($buf!==false) {
        lecho('OK :-)'.N);
        foreach (explode(N,$buf) as $line) {
            // Skip "#" comment lines and blank lines.
            if (preg_match('/(^#.*$)|(^\s*$)/',$line)===0) {
                $brow=blpgdumplinetomy($line);
                if (!array_key_exists($brow['Domain'],$blacklist))
                    $blacklistnew[$brow['Domain']]=$brow;
                // The freshly fetched row always replaces the in-memory one.
                $blacklist[$brow['Domain']]=$brow;
            }
        }
    } else {
        lecho('ERRORE :-('.N);
    }
}

// Queue the instances already stored in the DB that are not in the list yet.
$res=mysqli_query($link,'SELECT URI FROM Instances')
    or mexit(mysqli_error($link).N,3,true);
while($row=mysqli_fetch_assoc($res)) {
    if (!array_key_exists($row['URI'],$insts))
        $insts[$row['URI']]=null;
}
ksort($insts);
ksort($blacklist);
ksort($blacklistnew);
lecho('Istanze recuperate: '.count($insts).N);
lecho('Istanze blacklistate: '.count($blacklist).', di cui '.count($blacklistnew).' nuove da aggiungere al DB.'.N);

// Persist the newly discovered blacklist rows.
// NOTE(review): CreatedAt/ModifiedAt carry pgdatetomy() results (a Unix timestamp,
// or false — escaped to an empty string — when parsing failed); confirm this
// matches the column types.
foreach ($blacklistnew as $row) {
    foreach($row as $key=>$val)
        $row[$key]=myesc($link,(string)$val);
    mysqli_query($link,'INSERT INTO Blacklist (ID, Domain, CreatedAt, ModifiedAt, Severity, RejectMedia, RejectReports, PrivateComment, PublicComment) VALUES (NULL, \''.$row['Domain'].'\', \''.$row['CreatedAt'].'\', \''.$row['ModifiedAt'].'\', \''.$row['Severity'].'\', \''.$row['RejectMedia'].'\', \''.$row['RejectReports'].'\', NULL, \''.$row['PublicComment'].'\')')
        or mexit(mysqli_error($link).N,3,true);
}
2019-12-26 21:57:36 +01:00
//INSERT INTO `Instances` (`ID`, `New`, `Chosen`, `Visible`, `BlackListed`, `URI`, `Title`, `ShortDesc`, `LongDesc`, `OurDesc`, `PlaceID`, `Email`, `Software`, `Version`, `UserCount`, `StatusCount`, `DomainCount`, `ActiveUsersMonth`, `ActiveUsersHalfYear`, `Thumb`, `RegOpen`, `RegReqApproval`, `MaxTootChars`, `AdmAccount`, `AdmDisplayName`, `AdmCreatedAt`, `AdmNote`, `AdmURL`, `AdmAvatar`, `AdmHeader`) VALUES (NULL, '1', '0', '0', '0', 'pantagruel.dnsup.net', 'Pantagruel', 'Descrizione breve', 'Descrizione lunga', 'Istanza molto carina senza soffitto, senza cucina', '1', 'Graume <graume@inventati.org>', 'mastodon', '3.0.1', '2', '12', '345', '5', '10', 'http://www.iedm.it', '1', '0', '540', 'admin', 'Admin', '2019-12-11', 'Note \'admin\'', 'https://rame.altervista.org', 'http://www.iedm.it', 'http://www.iedm.it');
2019-12-15 17:06:02 +01:00
2019-12-26 21:57:36 +01:00
/**
 * Maps a truthy value to 1 and a falsy value to 0.
 *
 * @param mixed $bool Value evaluated for truthiness.
 * @return int 1 or 0.
 */
function b2i($bool) {
    return($bool ? 1 : 0);
}
2019-12-26 21:57:36 +01:00
/**
 * "Array key exists and value is not null".
 *
 * Also returns false when $arr is not an array at all, so values decoded from
 * remote JSON can be probed without risking a fatal error.
 *
 * @param int|string $key Key to look up.
 * @param mixed      $arr Array to inspect (taken by reference, never modified).
 * @return bool True only when $arr is an array holding a non-null value under $key.
 */
function akeavinn($key,&$arr) {
    return(is_array($arr) && array_key_exists($key,$arr) && !is_null($arr[$key]));
}
2019-12-26 21:57:36 +01:00
/**
 * Normalises blank strings to null.
 *
 * @param string $str Text to check.
 * @return mixed null when $str is empty or whitespace only, otherwise $str unchanged.
 */
function nempty($str) {
    $blank=(preg_match('/^\s*$/',$str)===1);
    return($blank ? null : $str);
}
2019-12-15 17:06:02 +01:00
2019-12-26 21:57:36 +01:00
/**
 * "Sub-array implode": joins the $key element of every row of $arr with $glue.
 *
 * @param string     $glue Separator placed between consecutive values.
 * @param int|string $key  Key read from each row.
 * @param array      $arr  List of arrays (taken by reference, never modified).
 * @return string The joined values; an empty string for an empty list.
 */
function subarim($glue,$key,&$arr) {
    $pieces=array();
    foreach ($arr as $inner)
        $pieces[]=$inner[$key];
    return(implode($glue,$pieces));
}
2019-12-17 13:19:12 +01:00
2019-12-26 21:57:36 +01:00
/**
 * Stores a notification row; terminates the script through mexit() if the INSERT fails.
 *
 * @param string $msg Notification text (escaped here before being embedded in the query).
 * @param int    $sev Severity, written as an integer literal.
 */
function notify($msg,$sev) {
    global $link;
    // microtime() without an argument returns the string "msec sec", which is not
    // a valid SQL literal: take the float form and print it locale-independently.
    $now=sprintf('%.6F',microtime(true));
    $query='INSERT INTO Notifications (ID, Notification, Severity, Microtime) VALUES (NULL, \''
        .myesc($link,$msg).'\', '.(int)$sev.', '.$now.')';
    mysqli_query($link,$query)
        or mexit(mysqli_error($link).N,3,true);
}
2019-12-15 17:06:02 +01:00
2019-12-26 21:57:36 +01:00
/*
* Nodeinfo ('https://'.$dom.'/nodeinfo/2.0') è stato aggiunto nella 3.0.0
* Trends ('https://'.$dom.'/api/v1/trends') è stato aggiunto nella 3.0.0
* Activity ('https://'.$dom.'/api/v1/instance/activity') è stato aggiunto nella 2.1.2
*/
2019-12-17 13:19:12 +01:00
2019-12-26 21:57:36 +01:00
// Open (and truncate) the JSON dump next to this script and start its top-level object.
if ($opts['jsonwrite']) {
    $jsonf=@fopen(__DIR__.'/'.$opts['jsonfp'],'w');
    if (!$jsonf) {
        mexit('Non ho potuto aprire in scrittura il file di dump delle info json «'.$opts['jsonfp'].'».',1);
    }
    fwrite($jsonf,'{'.N);
}
// Main crawl: query every queued domain, collect its public info and build the
// INSERT/UPDATE statement for the Instances table.
// NOTE(review): both statements are only printed, never executed.
$cinsts=count($insts);
$i=0;
$ok=0;
// Top-level keys of the /api/v1/instance reply => Instances column (blank strings become NULL).
$textfields=array(
    'title'=>'Title',
    'short_description'=>'ShortDesc',
    'description'=>'LongDesc',
    'email'=>'Email',
    'version'=>'Version',
    'thumbnail'=>'Thumb'
);
// Keys of the "stats" object => Instances column (copied as they are).
$statfields=array(
    'user_count'=>'UserCount',
    'status_count'=>'StatusCount',
    'domain_count'=>'DomainCount'
);
// Keys of the "contact_account" object => Instances column (blank strings become NULL).
$admfields=array(
    'acct'=>'AdmAccount',
    'display_name'=>'AdmDisplayName',
    'url'=>'AdmURL',
    'avatar'=>'AdmAvatar',
    'header'=>'AdmHeader'
);
foreach ($insts as $dom=>$row) {
    $i++;
    lecho('~~~~~~~~~~~~~~~'.N);
    lecho('Provo a recuperare info su «'.$dom.'» ['.$i.'/'.$cinsts.' ('.$ok.' OK) - '.round(100/$cinsts*$i).'%]'.N);
    lecho('Provo a recuperare le informazioni API sullistanza ... ');
    $buf=@file_get_contents('https://'.$dom.'/api/v1/instance',false,$context);
    // A body that does not decode to a JSON object/array is handled like a failed request.
    $info=($buf!==false) ? json_decode($buf,true) : null;
    if (!is_array($info)) {
        $info=null;
        lecho('ERRORE :-('.N);
        continue;
    }
    $ok++;
    lecho('OK :-)'.N);

    // Optional endpoints, gated on the reported version:
    // activity exists since 2.1.2, nodeinfo and trends since 3.0.0.
    // version_compare() is required: a plain string comparison ranks "10.0.0" below "2.1.2".
    $ver=(array_key_exists('version',$info) && is_string($info['version'])) ? $info['version'] : null;
    $extras=array();
    if (!is_null($ver) && version_compare($ver,'2.1.2','>='))
        $extras[]=array('x-activity','/api/v1/instance/activity','Provo a recuperare le informazioni API sullattività dellistanza ... ');
    if (!is_null($ver) && version_compare($ver,'3.0.0','>=')) {
        $extras[]=array('x-nodeinfo','/nodeinfo/2.0','Provo a recuperare le informazioni Nodeinfo sullistanza ... ');
        $extras[]=array('x-trends','/api/v1/trends','Provo a recuperare le informazioni API sui trends dellistanza ... ');
    }
    foreach ($extras as $extra) {
        lecho($extra[2]);
        $buf=@file_get_contents('https://'.$dom.$extra[1],false,$context);
        if ($buf!==false) {
            lecho('OK :-)'.N);
            $info[$extra[0]]=json_decode($buf,true);
        } else {
            lecho('ERRORE :-('.N);
        }
    }

    // Without a usable (string, non-blank) "uri" the instance cannot be stored.
    if (!akeavinn('uri',$info) || !is_string($info['uri']) || is_null(nempty($info['uri'])))
        continue;

    lecho(json_encode($info,JSON_PRETTY_PRINT).N,true);
    if ($opts['jsonwrite']) {
        // Encode the key as well, so quotes or backslashes in a remote "uri" cannot break the dump.
        fwrite($jsonf,json_encode($info['uri']).': '.json_encode($info,JSON_PRETTY_PRINT).','.N);
    }

    // One entry per column of the Instances table, in column order.
    $instrow=array(
        'ID'=>null, 'New'=>0, 'Chosen'=>0, 'Visible'=>0, 'BlackListed'=>0,
        'URI'=>null, 'Title'=>null, 'ShortDesc'=>null, 'LongDesc'=>null, 'OurDesc'=>null,
        'PlaceID'=>null, 'Email'=>null, 'Software'=>null, 'Version'=>null,
        'UserCount'=>null, 'StatusCount'=>null, 'DomainCount'=>null,
        'ActiveUsersMonth'=>null, 'ActiveUsersHalfYear'=>null, 'Thumb'=>null,
        'RegOpen'=>null, 'RegReqApproval'=>null, 'MaxTootChars'=>null,
        'AdmAccount'=>null, 'AdmDisplayName'=>null, 'AdmCreatedAt'=>null, 'AdmNote'=>null,
        'AdmURL'=>null, 'AdmAvatar'=>null, 'AdmHeader'=>null
    );
    if (array_key_exists($info['uri'],$blacklist))
        $instrow['BlackListed']=1;
    $instrow['URI']=nempty($info['uri']);

    // The is_scalar()/is_array() guards keep unexpected JSON shapes from a remote
    // server from crashing the string helpers.
    foreach ($textfields as $src=>$dst) {
        if (akeavinn($src,$info) && is_scalar($info[$src]))
            $instrow[$dst]=nempty($info[$src]);
    }
    if (akeavinn('stats',$info) && is_array($info['stats'])) {
        foreach ($statfields as $src=>$dst) {
            if (akeavinn($src,$info['stats']))
                $instrow[$dst]=$info['stats'][$src];
        }
    }
    if (akeavinn('max_toot_chars',$info))
        $instrow['MaxTootChars']=$info['max_toot_chars'];
    if (akeavinn('registrations',$info))
        $instrow['RegOpen']=b2i($info['registrations']);
    if (akeavinn('approval_required',$info))
        $instrow['RegReqApproval']=b2i($info['approval_required']);
    if (akeavinn('contact_account',$info) && is_array($info['contact_account'])) {
        $acct=$info['contact_account'];
        foreach ($admfields as $src=>$dst) {
            if (akeavinn($src,$acct) && is_scalar($acct[$src]))
                $instrow[$dst]=nempty($acct[$src]);
        }
        if (akeavinn('created_at',$acct) && is_string($acct['created_at'])) {
            // An unparsable date stays NULL instead of being stored as false.
            $created=pgdatetomy($acct['created_at']);
            if ($created!==false)
                $instrow['AdmCreatedAt']=$created;
        }
        if (akeavinn('note',$acct) && is_scalar($acct['note']))
            $instrow['AdmNote']=nempty(strip_tags($acct['note'],'<a>'));
    }
    if (akeavinn('x-nodeinfo',$info) && is_array($info['x-nodeinfo'])) {
        $ninfo=$info['x-nodeinfo'];
        if (akeavinn('software',$ninfo) && is_array($ninfo['software'])
            && akeavinn('name',$ninfo['software']) && is_scalar($ninfo['software']['name']))
            $instrow['Software']=nempty($ninfo['software']['name']);
        if (akeavinn('usage',$ninfo) && is_array($ninfo['usage'])
            && akeavinn('users',$ninfo['usage']) && is_array($ninfo['usage']['users'])) {
            $users=$ninfo['usage']['users'];
            if (akeavinn('activeMonth',$users))
                $instrow['ActiveUsersMonth']=$users['activeMonth'];
            if (akeavinn('activeHalfyear',$users))
                $instrow['ActiveUsersHalfYear']=$users['activeHalfyear'];
        }
    }

    $res=mysqli_query($link,'SELECT * FROM Instances WHERE URI=\''.myesc($link,$instrow['URI']).'\'')
        or mexit(mysqli_error($link).N,3,true);
    if (mysqli_num_rows($res)>0) {
        lecho('«'.$instrow['URI'].'» è già presente nel DB, la aggiorno...'.N);
        $oldinstrow=mysqli_fetch_assoc($res);
        // NOTE(review): New, Chosen, Visible, OurDesc and PlaceID are never filled
        // from the crawl, so this UPDATE resets them to the defaults above — confirm
        // that is intended before the query is actually executed.
        $sets=array();
        foreach ($instrow as $field=>$value) {
            // ID identifies the row in the WHERE clause and must not be overwritten.
            if ($field==='ID')
                continue;
            // Missing values become a real SQL NULL, not the string 'NULL'.
            $sets[]=$field.'='.(is_null($value) ? 'NULL' : '\''.myesc($link,(string)$value).'\'');
        }
        $query='UPDATE Instances SET '.implode(', ',$sets).' WHERE Instances.ID='.$oldinstrow['ID'];
        echo('QUERONA DI UPDATE: «'.$query.'».'.N);
        // Disabled work in progress: synchronisation of the instance languages (InstLangs).
        // NOTE(review) before re-enabling: it calls mysql_query()/mysql_fetch_assoc()
        // instead of the mysqli_* functions, reads $info['URI'] although the key is
        // 'uri', and passes $oldinstlangs to both subarim() calls of the "changed
        // from ... to ..." notification.
/*		$res=mysql_query($link,'SELECT InstID, LangID, Pos, Code FROM InstLangs LEFT JOIN Languages ON Languages.ID=LangID WHERE InstID='.$oldinstrow['ID'].' ORDER BY Pos ASC')
            or mexit(mysqli_error($link).N,3,true);
        $oldinstlangs=array();
        while ($row=mysql_fetch_assoc($res))
            $oldinstlangs[]=$row;
        if (akeavinn('languages',$info)) {
            $instlangs=array();
            $pos=0;
            foreach ($info['languages'] as $lang) {
                $res=mysqli_query($link,'SELECT * FROM Languages WHERE Code=\''.myesc($link,$lang).'\'')
                    or mexit(mysqli_error($link).N,3,true);
                if (mysqli_num_rows($res)<1) {
                    mysqli_query($link,'INSERT INTO Languages (ID, Code, Name) VALUES (NULL, \''.myesc($link,$lang).'\', NULL)')
                        or mexit(mysqli_error($link).N,3,true);
                    $langid=mysqli_insert_id($link);
                    notify('Laggiornamento dei dati relativi allistanza «<a href="editinst.php?id='.$oldinstrow['ID'].'">'.$info['URI'].'</a>» ha aggiunto un codice lingua non ancora noto, «'.$lang.'», di cui non conosco il nome per esteso. Puoi <a href="editlang.php?id='.$langid.'">editarlo qui</a>.',1);
                } else {
                    $row=mysqli_fetch_assoc($res);
                    $langid=$row['ID'];
                }
                $pos++;
                $instlangs[]=array('InstID'=>$oldinstrow['ID'],'LangID'=>$langid,'Pos'=>$pos,'Code'=>$lang);
            }
            print_r($instlangs);
            print_r($oldinstlangs);
            if ($instlangs!=$oldinstlangs) {
                notify('La lista delle lingue utilizzate dichiarate dallistanza «<a href="editinst.php?id='.$oldinstrow['ID'].'">'.$info['URI'].'</a>» è cambiata da «'.subarim(', ','Code',$oldinstlangs).'» a «'.subarim(', ','Code',$oldinstlangs).'».',1);
                mysqli_query($link,'DELETE FROM InstLangs WHERE InstID='.$oldinstrow['ID'])
                    or mexit(mysqli_error($link).N,3,true);
                foreach ($instlangs as $row) {
                    mysqli_query($link,'INSERT INTO InstLangs (InstID, LangID, Pos) VALUES ('.$row['InstID'].', '.$row['LangID'].', '.$row['Pos'].')')
                        or mexit(mysqli_error($link).N,3,true);
                }
            }
        }*/
    } else {
        lecho('«'.$info['uri'].'» non è già presente nel DB, la aggiungo...'.N);
        $instrow['New']=1;
        $fields=array_keys($instrow);
        $values=array();
        foreach ($instrow as $value)
            $values[]=is_null($value) ? 'NULL' : '\''.myesc($link,(string)$value).'\'';
        $query='INSERT INTO Instances ('.implode(', ',$fields).') VALUES ('.implode(', ',$values).')';
        echo('QUERONA DI INSERT: «'.$query.'»'.N);
    }
}
2019-12-26 21:57:36 +01:00
// Regular shutdown. Each handle is cleared right after it is closed: signals are
// dispatched between statements (declare(ticks=1)), and signalHandler() would
// otherwise try to use an already-closed handle if one arrives here.
mysqli_close($link);
$link=null;

if ($opts['jsonwrite']) {
    // Closing sentinel entry and brace: every instance entry was written with a trailing comma.
    fwrite($jsonf,'"Fine?": true'.N.'}'.N);
    fclose($jsonf);
    $jsonf=null;
}
if ($opts['log']) {
    fclose($logf);
    $logf=null;
}
exit(0);
2019-12-01 09:07:45 +01:00
?>