Merge branch 'main' into weblate

This commit is contained in:
pezcurrel 2022-12-08 08:30:52 +01:00
commit f856af3053
4 changed files with 136 additions and 11 deletions

View file

@ -63,8 +63,8 @@ if (function_exists('pcntl_signal')) {
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
}
$opts=array(
'timeout'=>5,
$opts=[
'timeout'=>10,
'deadline'=>60*24*60*60,// if an instance has not been responding for more than this value of seconds (currently 60 days), declare it dead
'oldline'=>30*24*60*60,// if an instance has been new for a period longer than this amount (currently 30 days), it's no longer new
'ldtoots'=>40,// number of toots to check with the automatic language detection function
@ -75,8 +75,9 @@ $opts=array(
'peersfp'=>null,
'dontrestore'=>false,
'ignorelock'=>false,
'fetchusers'=>false,
'moreclauses'=>''
);
];
$help='crawler.php
DESCRIPTION
@ -92,6 +93,10 @@ $help='crawler.php
which are already present in the database.
Note that this option is ignored if the script will recover a previous
unfinished session.
-f, --fetchusers
*Currently experimental*: if this option is set, the script will try and
fetch users profiles infos from each considered instances user directory
and store them in the database.
-t, --timeout <seconds>
Sets the timeout in seconds for every connection attempt.
DEFAULT: «'.$opts['timeout'].'»
@ -133,6 +138,10 @@ for ($i=1; $i<$argc; $i++) {
$i++;
$opts['peersfp']=$argv[$i];
break;
case '-f':
case '--fetchusers':
$opts['fetchusers']=true;
break;
case '-t':
case '--timeout':
if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
@ -682,6 +691,65 @@ while ($i<$cinsts) {
} else {
eecho(2,'could not fetch instance trends from API: '.$buf['emsg'].N);
}
// Experimental (see the --fetchusers help text): walk the instance's user directory
// page by page and mirror every returned profile into the Users table.
if ($opts['fetchusers']) {
// Load the users already stored for this host, indexed as locid => ID.
$res=mysqli_query($link,'SELECT ID, locid FROM Users WHERE host=\''.myesc($link,$host).'\'')
or mexit(__LINE__.': '.mysqli_error($link).N,3);
$uids=[];
while ($row=mysqli_fetch_assoc($res)) $uids[$row['locid']]=$row['ID'];
// NOTE(review): $uids is only dumped here and is not read again inside this block;
// this looks like leftover debug output - confirm whether it is still wanted.
print_r($uids);
eecho(0,'trying to fetch users info from directory API...'.N);
// Pagination state: $chunk is the zero-based page number, $limit the page size
// requested from the API, $end the flag that stops the loop.
$chunk=0;
$limit=80;
$end=false;
while (!$end) {
$offset=$chunk*$limit;
// Ask for local accounts only, newest first, one page at a time.
$buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']);
// NOTE(review): a «cont» that is exactly false is taken as "fetch failed" - confirm against getfc().
if ($buf['cont']!==false) {
// May sleep, depending on the rate limit headers sent back by the server.
ckratelimit($buf['headers']);
eecho(1,'got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N);
// From here on $buf holds the decoded JSON instead of the getfc() result.
$buf=@json_decode($buf['cont'],true);
if (is_array($buf)) {
//print_r($buf);
// A page shorter than requested is treated as the last one.
if (count($buf)<$limit) $end=true;
//foreach ($buf as $user) echo($user['username'].' '); echo(N.N);
foreach ($buf as $user) {
// Only records carrying all of these keys are stored; note that «fields» is required
// here but is not written by the query built below.
if (make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields'], $user)) {
// «noindex» is optional in the response: default it to false.
if (!isset($user['noindex'])) $user['noindex']=false;
// Convert the dates only when present; pgdatetomy() is defined outside this view -
// presumably it handles full date-time strings, verify.
if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']);
if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']);
// Column assignments shared by the INSERT and the UPDATE; myv() quotes/escapes each value.
$query='host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,$user['username']).', display_name='.myv($link,$user['display_name']).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', discoverable='.myv($link,$user['discoverable']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,$user['note']).', url='.myv($link,$user['url']).', avatar='.myv($link,$user['avatar']).', header='.myv($link,$user['header']).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', noindex='.myv($link,$user['noindex']);
// $do stays true unless the table turns out to hold duplicates for this user.
$do=true;
// Look for an existing row for this host + local id to choose between INSERT and UPDATE.
$ures=mysqli_query($link,'SELECT * FROM Users WHERE host=\''.myesc($link,$host).'\' AND locid=\''.myesc($link,$user['id']).'\'')
or mexit(__LINE__.': '.mysqli_error($link).N,3);
$nr=mysqli_num_rows($ures);
if ($nr==0) {
$query='INSERT INTO Users SET '.$query;
} elseif ($nr==1) {
$urow=mysqli_fetch_assoc($ures);
//print_r($urow);
$query='UPDATE Users SET '.$query.' WHERE ID='.$urow['ID'];
} else {
// More than one matching row: write nothing for this user and report it.
$do=false;
notify('Table Users contains more than one record with locid='.$user['id'].' and host='.$host,3);
}
// With the «dryrun» option set the statement is built but never executed.
if ($do && !$opts['dryrun']) mysqli_query($link,$query)
or mexit(__LINE__.': '.mysqli_error($link).N,3);
} else {
eecho(2,'user record missed some required keys :-('.N);
//print_r($user);
}
}
} else {
// The body did not decode to an array: give up on this instance's directory.
eecho(2,'... but the chunk was not good JSON :-('.N);
$end=true;
}
// The page counter advances only after a successful fetch.
$chunk++;
} else {
// Fetch failed: report the error message returned by getfc() and stop paginating.
eecho(2,'could not fetch users info from directory API: '.$buf['emsg'].N);
$end=true;
}
}
}
}
}
} else {
@ -1062,4 +1130,57 @@ eecho(1,'Done :-)'.N);
exit(0);
// "multi array_key_exists"
// Tells whether every key listed in $keys is present in $arr.
// $keys: list of the keys to look for.
// $arr: the array to inspect; it is taken by reference but never modified.
// Returns true only when $arr is an array holding all the keys (a key whose
// value is null still counts as present), false otherwise.
function make($keys,&$arr) {
	// $arr may come straight from decoded remote JSON, so it is not guaranteed
	// to be an array: on PHP 8 array_key_exists() would throw a TypeError.
	if (!is_array($arr))
		return(false);
	foreach ($keys as $key)
		if (!array_key_exists($key,$arr))
			return(false);
	return(true);
}
// Renders a PHP value as a literal that can be spliced into a MySQL statement.
// $link: the mysqli connection, used only to escape strings.
// $var: the value to render.
// Returns «NULL» (unquoted) for null and for values that are empty once trimmed,
// «1» / «0» for booleans, and the escaped value wrapped in single quotes for
// anything else.
function myv(&$link,$var) {
	// booleans first: they become bare 1 / 0
	if (is_bool($var))
		return($var ? '1' : '0');
	// null, empty and whitespace-only values all end up as SQL NULL
	if (is_null($var) || trim($var)=='')
		return('NULL');
	return('\''.mysqli_real_escape_string($link,$var).'\'');
}
// Converts a date written as «year-month-day» into a Unix timestamp.
// $date: string starting with «YYYY-MM-DD»; anything following the day
//        (e.g. the time part of a full ISO 8601 date-time) is ignored.
// Returns the timestamp of 00:00:00 of that day as computed by mktime(), i.e.
// in the script's default timezone, or null when $date does not start with
// three dash-separated numbers.
function datetomy($date) {
	// Validate before calling mktime(): with a plain explode() a short or
	// malformed value hands it null or non-numeric arguments, which on PHP 8
	// either silently mean "current day/month" or throw a TypeError.
	if (!is_string($date) || preg_match('/^\s*(\d+)-(\d+)-(\d+)/',$date,$parts)!==1)
		return(null);
	// mktime() wants hour, minute, second, month, day, year
	return(mktime(0,0,0,(int)$parts[2],(int)$parts[3],(int)$parts[1]));
}
// Reads the rate limit headers of an HTTP response and, when fewer than 3
// requests are left, sleeps until the moment the server says the limit resets.
// $httpresphead: the raw response head: status line first, then one header per
//                line, lines separated by CRLF.
// $verbose: when true, also reports through eecho() the values that were read,
//           or the fact that no usable «x-ratelimit-reset» header was found.
// Returns nothing.
function ckratelimit($httpresphead,$verbose=false) {
	$lines=explode("\r\n",$httpresphead);
	// the first line is the HTTP status line, not a header
	array_shift($lines);
	$headers=[];
	foreach ($lines as $line) {
		// Header names are case-insensitive: index them lowercased so that
		// «X-RateLimit-Reset» and «x-ratelimit-reset» are both recognized.
		if (preg_match('/^([^:]+):(.*)$/Uu',$line,$matches)===1)
			$headers[strtolower(trim($matches[1]))]=trim($matches[2]);
	}
	//print_r($headers);
	if (!array_key_exists('x-ratelimit-reset',$headers)) {
		if ($verbose) eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!'.N);
		return;
	}
	// The server's idea of "now", e.g. «Wed, 30 Mar 2022 21:27:22 GMT»; when the
	// «date» header is missing or unparsable fall back to the local clock.
	$srvnow=(array_key_exists('date',$headers) ? strtotime($headers['date']) : false);
	if ($srvnow===false) $srvnow=time();
	// The moment the limit gets reset, e.g. «2022-03-31T04:05:00.058705Z».
	$srvrlr=strtotime($headers['x-ratelimit-reset']);
	if ($srvrlr===false) {
		// without a usable reset time there is no sensible amount of time to wait
		if ($verbose) eecho(2,'ckratelimit: could not parse the «x-ratelimit-reset» header!'.N);
		return;
	}
	// Seconds to sleep, plus one of margin; never negative, because sleep()
	// throws a ValueError on negative values on PHP 8.
	$stosl=max(0,$srvrlr-$srvnow+1);
	$hasrem=array_key_exists('x-ratelimit-remaining',$headers);
	// a missing «x-ratelimit-remaining» counts as 0, so the script waits
	$remaining=($hasrem ? (int)$headers['x-ratelimit-remaining'] : 0);
	if ($verbose) eecho(0,'ckratelimit: x-ratelimit-remaining: '.($hasrem ? $headers['x-ratelimit-remaining'] : '').'; $srvnow: '.gmdate('c',$srvnow).'; $srvrlr: '.gmdate('c',$srvrlr).'; current time to sleep: '.$stosl.'.'.N);
	if ($remaining<3) {
		eecho(2,'reached rate limit, sleeping for '.$stosl.' seconds ...'.N);
		sleep($stosl);
	}
}
?>

View file

@ -9,7 +9,7 @@ elif [ ! -d "$1" ]; then
fi
cd "$1"
MAILCFG="../../mail.conf"
MAILCFG="../conf/mail.conf"
LOGF="crawl.log"
MAILF="mail"
@ -26,7 +26,7 @@ function domail {
function logcmd {
init="$(date)"
echo "$init: eseguo «$1»" >> $LOGF
$1 > $2
$1 &> $2
ec=$?
endt="$(date)"
echo -n "$endt: fine esecuzione di «$1»: " >> $LOGF
@ -50,9 +50,9 @@ logcmd "php mustool.php updstats" "mustool.updstats.log"
if [ "$2" == "dopeers" ]; then
logcmd "php peerscrawl.php -e peerscrawl.exclude -E" "peerscrawl.log"
logcmd "php crawler.php -p peers -t 8" "crawler.log"
logcmd "php crawler.php -p peers -t 10" "crawler.log"
else
logcmd "php crawler.php -t 8" "crawler.log"
logcmd "php crawler.php -t 10" "crawler.log"
fi
logcmd "php mustool.php shuffle clean optimize" "mustool.shuffle-clean-optimize.log"

View file

@ -254,6 +254,7 @@ if ($opts['excludedead']) {
while ($row=mysqli_fetch_assoc($res))
$deadinsts[]=$row['URI'];
unset($res);
gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
}
/*$contextopts=array(
@ -403,7 +404,7 @@ function crawl($list,$id) {
} elseif (!validhostname($peer)) {
gecho(' ERROR: I wont add «'.$peer.'» to next round list because its not a valid hostname.'.N,true,true);
} elseif (ckexarr($peer)) {
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,true);
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,false);
} elseif (in_array($peer,$ainsts)) {
if ($opts['verbose'])
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $ainsts.'.N,true,false);
@ -420,8 +421,7 @@ function crawl($list,$id) {
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its already in $insts.'.N,true,false);
}*/
} elseif ($opts['excludedead'] && in_array($peer,$deadinsts)) {
if ($opts['verbose'])
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its DEAD.'.N,true,false);
gecho(' NOTICE: I wont add «'.$peer.'» to next round list because its DEAD.'.N,true,false);
} else {
//EVVIVA!
gecho(' ADDING PEER «'.$peer.'» to next round list.'.N,true,false);

View file

@ -11,7 +11,11 @@ function getfc($url,$timeout,$headers=[]) {
curl_setopt($curli,CURLOPT_TIMEOUT,$timeout);
curl_setopt($curli,CURLOPT_CONNECTTIMEOUT,$timeout);
curl_setopt($curli,CURLOPT_HTTPHEADER,$headers);
curl_setopt($curli,CURLOPT_HEADER,true);
$cont=curl_exec($curli);
$headers_sz=curl_getinfo($curli,CURLINFO_HEADER_SIZE);
$headers=substr($cont,0,$headers_sz);
$cont=substr($cont,$headers_sz);
$emsg=false;
$cerrno=curl_errno($curli);
if ($cerrno) {
@ -24,7 +28,7 @@ function getfc($url,$timeout,$headers=[]) {
$emsg='unknown';
}
curl_close($curli);
return(array('cont'=>$cont,'emsg'=>$emsg));
return(['cont'=>$cont,'headers'=>$headers,'emsg'=>$emsg]);
}
?>