Merge branch 'main' into weblate
This commit is contained in:
commit
f856af3053
4 changed files with 136 additions and 11 deletions
|
@ -63,8 +63,8 @@ if (function_exists('pcntl_signal')) {
|
|||
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
|
||||
}
|
||||
|
||||
$opts=array(
|
||||
'timeout'=>5,
|
||||
$opts=[
|
||||
'timeout'=>10,
|
||||
'deadline'=>60*24*60*60,// if an instance has not been responding for more than this value of seconds (currently 60 days), declare it dead
|
||||
'oldline'=>30*24*60*60,// if an instance has been new for a period longer than this amount (currently 30 days), it's no longer new
|
||||
'ldtoots'=>40,// number of toots to check with the automatic language detection function
|
||||
|
@ -75,8 +75,9 @@ $opts=array(
|
|||
'peersfp'=>null,
|
||||
'dontrestore'=>false,
|
||||
'ignorelock'=>false,
|
||||
'fetchusers'=>false,
|
||||
'moreclauses'=>''
|
||||
);
|
||||
];
|
||||
|
||||
$help='crawler.php
|
||||
DESCRIPTION
|
||||
|
@ -92,6 +93,10 @@ $help='crawler.php
|
|||
which are already present in the database.
|
||||
Note that this option is ignored if the script will recover a previous
|
||||
unfinished session.
|
||||
-f, --fetchusers
|
||||
*Currently experimental*: if this option is set, the script will try and
|
||||
fetch users’ profiles infos from each considered instance’s user directory
|
||||
and store them in the database.
|
||||
-t, --timeout <seconds>
|
||||
Sets the timeout in seconds for every connection attempt.
|
||||
DEFAULT: «'.$opts['timeout'].'»
|
||||
|
@ -133,6 +138,10 @@ for ($i=1; $i<$argc; $i++) {
|
|||
$i++;
|
||||
$opts['peersfp']=$argv[$i];
|
||||
break;
|
||||
case '-f':
|
||||
case '--fetchusers':
|
||||
$opts['fetchusers']=true;
|
||||
break;
|
||||
case '-t':
|
||||
case '--timeout':
|
||||
if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
|
||||
|
@ -682,6 +691,65 @@ while ($i<$cinsts) {
|
|||
} else {
|
||||
eecho(2,'could not fetch instance trends from API: '.$buf['emsg'].N);
|
||||
}
|
||||
if ($opts['fetchusers']) {
|
||||
$res=mysqli_query($link,'SELECT ID, locid FROM Users WHERE host=\''.myesc($link,$host).'\'')
|
||||
or mexit(__LINE__.': '.mysqli_error($link).N,3);
|
||||
$uids=[];
|
||||
while ($row=mysqli_fetch_assoc($res)) $uids[$row['locid']]=$row['ID'];
|
||||
print_r($uids);
|
||||
eecho(0,'trying to fetch users info from directory API...'.N);
|
||||
$chunk=0;
|
||||
$limit=80;
|
||||
$end=false;
|
||||
while (!$end) {
|
||||
$offset=$chunk*$limit;
|
||||
$buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']);
|
||||
if ($buf['cont']!==false) {
|
||||
ckratelimit($buf['headers']);
|
||||
eecho(1,'got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N);
|
||||
$buf=@json_decode($buf['cont'],true);
|
||||
if (is_array($buf)) {
|
||||
//print_r($buf);
|
||||
if (count($buf)<$limit) $end=true;
|
||||
//foreach ($buf as $user) echo($user['username'].' '); echo(N.N);
|
||||
foreach ($buf as $user) {
|
||||
if (make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields'], $user)) {
|
||||
if (!isset($user['noindex'])) $user['noindex']=false;
|
||||
if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']);
|
||||
if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']);
|
||||
$query='host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,$user['username']).', display_name='.myv($link,$user['display_name']).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', discoverable='.myv($link,$user['discoverable']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,$user['note']).', url='.myv($link,$user['url']).', avatar='.myv($link,$user['avatar']).', header='.myv($link,$user['header']).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', noindex='.myv($link,$user['noindex']);
|
||||
$do=true;
|
||||
$ures=mysqli_query($link,'SELECT * FROM Users WHERE host=\''.myesc($link,$host).'\' AND locid=\''.myesc($link,$user['id']).'\'')
|
||||
or mexit(__LINE__.': '.mysqli_error($link).N,3);
|
||||
$nr=mysqli_num_rows($ures);
|
||||
if ($nr==0) {
|
||||
$query='INSERT INTO Users SET '.$query;
|
||||
} elseif ($nr==1) {
|
||||
$urow=mysqli_fetch_assoc($ures);
|
||||
//print_r($urow);
|
||||
$query='UPDATE Users SET '.$query.' WHERE ID='.$urow['ID'];
|
||||
} else {
|
||||
$do=false;
|
||||
notify('Table Users contains more than one record with locid='.$user['id'].' and host='.$host,3);
|
||||
}
|
||||
if ($do && !$opts['dryrun']) mysqli_query($link,$query)
|
||||
or mexit(__LINE__.': '.mysqli_error($link).N,3);
|
||||
} else {
|
||||
eecho(2,'user record missed some required keys :-('.N);
|
||||
//print_r($user);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
eecho(2,'... but the chunk was not good JSON :-('.N);
|
||||
$end=true;
|
||||
}
|
||||
$chunk++;
|
||||
} else {
|
||||
eecho(2,'could not fetch users info from directory API: '.$buf['emsg'].N);
|
||||
$end=true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -1062,4 +1130,57 @@ eecho(1,'Done :-)'.N);
|
|||
|
||||
exit(0);
|
||||
|
||||
// "multi array_key_exists"
|
||||
// Tells whether every key listed in $keys exists in $arr (a key whose value is
// null still counts as existing).
// $keys: list of keys to look for.
// $arr:  the array to inspect; it is taken by reference but never modified.
//        Anything that is not an array (e.g. a scalar element found while
//        iterating over decoded JSON) yields false instead of being handed
//        to array_key_exists(), which would fail on it.
// Returns true when all the keys are present, false otherwise.
function make($keys,&$arr) {
	if (!is_array($arr))
		return(false);
	foreach ($keys as $key)
		if (!array_key_exists($key,$arr))
			return(false);
	return(true);
}
|
||||
|
||||
// Renders a PHP value as an SQL literal ready to be concatenated into a query:
//  - null, and any value that trims down to an empty string, become NULL (unquoted);
//  - booleans become 1 or 0 (unquoted);
//  - everything else is escaped with mysqli_real_escape_string() and wrapped
//    in single quotes.
// $link: mysqli connection, only used for escaping.
// $var:  the value to render.
// Returns the literal as a string.
function myv(&$link,$var) {
	if (is_null($var))
		return('NULL');
	if (is_bool($var))
		return($var ? '1' : '0');
	if (trim($var)=='')
		return('NULL');
	return('\''.mysqli_real_escape_string($link,$var).'\'');
}
|
||||
|
||||
// Converts a date string starting with "<year>-<month>-<day>" into the Unix
// timestamp of that day at 00:00:00, as computed by mktime() (i.e. in the
// script's current timezone). Whatever follows the day number is ignored.
// $date: the date string, e.g. «2022-03-30».
// Returns the timestamp, or null when $date is not a string or does not start
// with a year-month-day triple, so that a malformed value does not reach
// mktime().
function datetomy($date) {
	if (!is_string($date) || preg_match('/^\s*(\d+)-(\d+)-(\d+)/',$date,$parts)!==1)
		return(null);
	// mktime() wants hour, minute, second, month, day, year
	return(mktime(0,0,0,(int)$parts[2],(int)$parts[3],(int)$parts[1]));
}
|
||||
|
||||
// Honours the rate limit a server advertises in its HTTP response headers.
// $httpresphead: the raw response header block, lines separated by CRLF; the
//                first line (the status line) is discarded.
// $verbose:      when true, the parsed values (or the reason why nothing was
//                done) are reported through eecho().
// If an «x-ratelimit-reset» header is present and «x-ratelimit-remaining» is
// lower than 3, the function sleeps until the reset time plus one second. The
// waiting time is computed against the server's own «date» header; the local
// clock is used only when that header is missing or unparsable.
// Returns nothing.
function ckratelimit($httpresphead,$verbose=false) {
	$lines=explode("\r\n",(string)$httpresphead);
	array_shift($lines);
	$headers=[];
	foreach ($lines as $line)
		if (preg_match('/^([^:]+):(.*)$/Uu',$line,$matches)===1)
			// HTTP header names are case-insensitive: normalise them, or
			// «X-RateLimit-Reset» would never be found below
			$headers[strtolower(trim($matches[1]))]=trim($matches[2]);
	//print_r($headers);
	if (!array_key_exists('x-ratelimit-reset',$headers)) {
		if ($verbose)
			eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!'.N);
		return;
	}
	// expected format: 2022-03-31T04:05:00.058705Z
	$srvrlr=strtotime($headers['x-ratelimit-reset']);
	if ($srvrlr===false) {
		if ($verbose)
			eecho(2,'ckratelimit: could not parse the «x-ratelimit-reset» header!'.N);
		return;
	}
	// expected format: Wed, 30 Mar 2022 21:27:22 GMT
	$srvnow=(array_key_exists('date',$headers) ? strtotime($headers['date']) : false);
	if ($srvnow===false)
		$srvnow=time();
	// never hand a negative value to sleep(): PHP 8 rejects it
	$stosl=max(0,$srvrlr-$srvnow+1);
	// a missing or non-numeric «x-ratelimit-remaining» means "unknown": don't sleep on it
	$remaining=null;
	if (array_key_exists('x-ratelimit-remaining',$headers) && is_numeric($headers['x-ratelimit-remaining']))
		$remaining=(int)$headers['x-ratelimit-remaining'];
	if ($verbose)
		eecho(0,'ckratelimit: x-ratelimit-remaining: '.(is_null($remaining) ? '' : $remaining).'; $srvnow: '.gmdate('c',$srvnow).'; $srvrlr: '.gmdate('c',$srvrlr).'; current time to sleep: '.$stosl.'.'.N);
	if (!is_null($remaining) && $remaining<3) {
		eecho(2,'reached rate limit, sleeping for '.$stosl.' seconds ...'.N);
		sleep($stosl);
	}
}
|
||||
|
||||
?>
|
||||
|
|
|
@ -9,7 +9,7 @@ elif [ ! -d "$1" ]; then
|
|||
fi
|
||||
|
||||
cd "$1"
|
||||
MAILCFG="../../mail.conf"
|
||||
MAILCFG="../conf/mail.conf"
|
||||
LOGF="crawl.log"
|
||||
MAILF="mail"
|
||||
|
||||
|
@ -26,7 +26,7 @@ function domail {
|
|||
function logcmd {
|
||||
init="$(date)"
|
||||
echo "$init: eseguo «$1»" >> $LOGF
|
||||
$1 > $2
|
||||
$1 &> $2
|
||||
ec=$?
|
||||
endt="$(date)"
|
||||
echo -n "$endt: fine esecuzione di «$1»: " >> $LOGF
|
||||
|
@ -50,9 +50,9 @@ logcmd "php mustool.php updstats" "mustool.updstats.log"
|
|||
|
||||
if [ "$2" == "dopeers" ]; then
|
||||
logcmd "php peerscrawl.php -e peerscrawl.exclude -E" "peerscrawl.log"
|
||||
logcmd "php crawler.php -p peers -t 8" "crawler.log"
|
||||
logcmd "php crawler.php -p peers -t 10" "crawler.log"
|
||||
else
|
||||
logcmd "php crawler.php -t 8" "crawler.log"
|
||||
logcmd "php crawler.php -t 10" "crawler.log"
|
||||
fi
|
||||
|
||||
logcmd "php mustool.php shuffle clean optimize" "mustool.shuffle-clean-optimize.log"
|
||||
|
|
|
@ -254,6 +254,7 @@ if ($opts['excludedead']) {
|
|||
while ($row=mysqli_fetch_assoc($res))
|
||||
$deadinsts[]=$row['URI'];
|
||||
unset($res);
|
||||
gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
|
||||
}
|
||||
|
||||
/*$contextopts=array(
|
||||
|
@ -403,7 +404,7 @@ function crawl($list,$id) {
|
|||
} elseif (!validhostname($peer)) {
|
||||
gecho(' ERROR: I won’t add «'.$peer.'» to next round list because it’s not a valid hostname.'.N,true,true);
|
||||
} elseif (ckexarr($peer)) {
|
||||
gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,true);
|
||||
gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,false);
|
||||
} elseif (in_array($peer,$ainsts)) {
|
||||
if ($opts['verbose'])
|
||||
gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $ainsts.'.N,true,false);
|
||||
|
@ -420,8 +421,7 @@ function crawl($list,$id) {
|
|||
gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $insts.'.N,true,false);
|
||||
}*/
|
||||
} elseif ($opts['excludedead'] && in_array($peer,$deadinsts)) {
|
||||
if ($opts['verbose'])
|
||||
gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s DEAD.'.N,true,false);
|
||||
gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s DEAD.'.N,true,false);
|
||||
} else {
|
||||
//EVVIVA!
|
||||
gecho(' ADDING PEER «'.$peer.'» to next round list.'.N,true,false);
|
||||
|
|
|
@ -11,7 +11,11 @@ function getfc($url,$timeout,$headers=[]) {
|
|||
curl_setopt($curli,CURLOPT_TIMEOUT,$timeout);
|
||||
curl_setopt($curli,CURLOPT_CONNECTTIMEOUT,$timeout);
|
||||
curl_setopt($curli,CURLOPT_HTTPHEADER,$headers);
|
||||
curl_setopt($curli,CURLOPT_HEADER,true);
|
||||
$cont=curl_exec($curli);
|
||||
$headers_sz=curl_getinfo($curli,CURLINFO_HEADER_SIZE);
|
||||
$headers=substr($cont,0,$headers_sz);
|
||||
$cont=substr($cont,$headers_sz);
|
||||
$emsg=false;
|
||||
$cerrno=curl_errno($curli);
|
||||
if ($cerrno) {
|
||||
|
@ -24,7 +28,7 @@ function getfc($url,$timeout,$headers=[]) {
|
|||
$emsg='unknown';
|
||||
}
|
||||
curl_close($curli);
|
||||
return(array('cont'=>$cont,'emsg'=>$emsg));
|
||||
return(['cont'=>$cont,'headers'=>$headers,'emsg'=>$emsg]);
|
||||
}
|
||||
|
||||
?>
|
||||
|
|
Loading…
Reference in a new issue