Added “fetchusers” option; changed “timeout” default from 5 to 10 seconds

This commit is contained in:
pezcurrel 2022-12-08 00:03:10 +01:00
parent e8e3fa5872
commit c3a90ba6b8

View file

@ -63,8 +63,8 @@ if (function_exists('pcntl_signal')) {
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
}
$opts=array(
'timeout'=>5,
$opts=[
'timeout'=>10,
'deadline'=>60*24*60*60,// if an instance has not been responding for more than this value of seconds (currently 60 days), declare it dead
'oldline'=>30*24*60*60,// if an instance has been new for a period longer than this amount (currently 30 days), it's no longer new
'ldtoots'=>40,// number of toots to check with the automatic language detection function
@ -75,8 +75,9 @@ $opts=array(
'peersfp'=>null,
'dontrestore'=>false,
'ignorelock'=>false,
'fetchusers'=>false,
'moreclauses'=>''
);
];
$help='crawler.php
DESCRIPTION
@ -92,6 +93,10 @@ $help='crawler.php
which are already present in the database.
Note that this option is ignored if the script will recover a previous
unfinished session.
-f, --fetchusers
*Currently experimental*: if this option is set, the script will try and
fetch users profiles infos from each considered instances user directory
and store them in the database.
-t, --timeout <seconds>
Sets the timeout in seconds for every connection attempt.
DEFAULT: «'.$opts['timeout'].'»
@ -133,6 +138,10 @@ for ($i=1; $i<$argc; $i++) {
$i++;
$opts['peersfp']=$argv[$i];
break;
case '-f':
case '--fetchusers':
$opts['fetchusers']=true;
break;
case '-t':
case '--timeout':
if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
@ -682,6 +691,65 @@ while ($i<$cinsts) {
} else {
eecho(2,'could not fetch instance trends from API: '.$buf['emsg'].N);
}
// Optional step (enabled by the 'fetchusers' option): page through this host's
// directory API and insert/update one row per returned user in the Users table.
if ($opts['fetchusers']) {
// Load the rows already stored for this host, keyed by locid.
$res=mysqli_query($link,'SELECT ID, locid FROM Users WHERE host=\''.myesc($link,$host).'\'')
or mexit(__LINE__.': '.mysqli_error($link).N,3);
$uids=[];
while ($row=mysqli_fetch_assoc($res)) $uids[$row['locid']]=$row['ID'];
// NOTE(review): within this block $uids is only dumped here and never read again;
// this print_r looks like leftover debug output - confirm whether it should stay.
print_r($uids);
eecho(0,'trying to fetch users info from directory API...'.N);
// Paging state: request $limit records at a time, starting at $chunk*$limit,
// until a short page, a decoding failure or a fetch error sets $end.
$chunk=0;
$limit=80;
$end=false;
while (!$end) {
$offset=$chunk*$limit;
// getfc() is defined elsewhere; its result is read below as an array with
// 'cont' (false on failure), 'headers' and 'emsg'.
$buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']);
if ($buf['cont']!==false) {
// may sleep when the response headers report that few requests are left
ckratelimit($buf['headers']);
eecho(1,'got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N);
// from here on $buf holds the decoded JSON instead of the raw getfc() result
$buf=@json_decode($buf['cont'],true);
if (is_array($buf)) {
//print_r($buf);
// a page shorter than $limit is taken to be the last one
if (count($buf)<$limit) $end=true;
//foreach ($buf as $user) echo($user['username'].' '); echo(N.N);
foreach ($buf as $user) {
// Only records carrying every listed key are stored.
// NOTE(review): 'fields' is required here but is not written by the query below.
if (make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields'], $user)) {
// 'noindex' is optional and defaults to false when absent
if (!isset($user['noindex'])) $user['noindex']=false;
// convert the two date fields (pgdatetomy() is defined elsewhere)
if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']);
if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']);
// column assignments shared by the INSERT and the UPDATE form; myv() quotes/escapes each value
$query='host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,$user['username']).', display_name='.myv($link,$user['display_name']).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', discoverable='.myv($link,$user['discoverable']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,$user['note']).', url='.myv($link,$user['url']).', avatar='.myv($link,$user['avatar']).', header='.myv($link,$user['header']).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', noindex='.myv($link,$user['noindex']);
$do=true;
// look for an existing row with the same host and locid to choose between INSERT and UPDATE
$ures=mysqli_query($link,'SELECT * FROM Users WHERE host=\''.myesc($link,$host).'\' AND locid=\''.myesc($link,$user['id']).'\'')
or mexit(__LINE__.': '.mysqli_error($link).N,3);
$nr=mysqli_num_rows($ures);
if ($nr==0) {
$query='INSERT INTO Users SET '.$query;
} elseif ($nr==1) {
$urow=mysqli_fetch_assoc($ures);
//print_r($urow);
$query='UPDATE Users SET '.$query.' WHERE ID='.$urow['ID'];
} else {
// more than one matching row: write nothing for this user, just notify
$do=false;
notify('Table Users contains more than one record with locid='.$user['id'].' and host='.$host,3);
}
// the write is skipped when the 'dryrun' option is set
if ($do && !$opts['dryrun']) mysqli_query($link,$query)
or mexit(__LINE__.': '.mysqli_error($link).N,3);
} else {
eecho(2,'user record missed some required keys :-('.N);
//print_r($user);
}
}
} else {
eecho(2,'... but the chunk was not good JSON :-('.N);
$end=true;
}
// counted even when the chunk could not be decoded ($end is already set in that case)
$chunk++;
} else {
eecho(2,'could not fetch users info from directory API: '.$buf['emsg'].N);
$end=true;
}
}
}
}
}
} else {
@ -1062,4 +1130,57 @@ eecho(1,'Done :-)'.N);
exit(0);
// make ("multi array_key_exists"): tells whether every key listed in $keys
// exists in $arr.
// $keys: list of keys to look for.
// $arr: the array to inspect (taken by reference, never modified here).
// Returns true when no listed key is missing (so also when $keys is empty),
// false as soon as one is. A key whose value is null still counts as present.
function make($keys,&$arr) {
	$allthere=true;
	foreach ($keys as $wanted) {
		if (!array_key_exists($wanted,$arr)) {
			$allthere=false;
			break;
		}
	}
	return($allthere);
}
// Renders $var as a literal that can be concatenated into an SQL statement.
// $link: the mysqli connection used for escaping (only touched for non-empty values).
// $var: the value to render.
// Returns:
//   - the bare word NULL for null and for values that are empty once trimmed;
//   - 1 or 0 (unquoted) for booleans;
//   - the escaped value wrapped in single quotes for everything else.
function myv(&$link,$var) {
	if (is_null($var))
		return('NULL');
	if (is_bool($var))
		return($var ? '1' : '0');
	// whitespace-only values are stored as NULL, just like empty ones
	if (trim($var)=='')
		return('NULL');
	$escaped=mysqli_real_escape_string($link,$var);
	return('\''.$escaped.'\'');
}
// Converts a date string into the Unix timestamp of midnight of that day,
// computed by mktime() in PHP's default timezone.
// $date: a string starting with «year-month-day» (e.g. «2022-12-08»).
//   Anything after the day is ignored, so a full ISO 8601 datetime such as
//   «2022-12-08T10:11:12.000Z» is accepted too, without relying on PHP's
//   implicit (and warning-raising) coercion of «08T10:11:12.000Z» to 8.
// Returns the timestamp, or null when $date is not a string or does not start
// with a year-month-day triplet; previously such input led to undefined
// offsets and either a silently wrong date or a fatal TypeError.
function datetomy($date) {
	if (!is_string($date) || preg_match('/^\s*(\d+)-(\d+)-(\d+)/',$date,$parts)!==1)
		return(null);
	// $parts: 1 => year, 2 => month, 3 => day
	return(mktime(0,0,0,(int)$parts[2],(int)$parts[3],(int)$parts[1]));
}
// Inspects the rate limit headers of an HTTP response and, when fewer than 3
// requests are left, sleeps until one second after the announced reset time.
// $httpresphead: the raw response header block (status line first, one
//   «Name: value» header per line).
// $verbose: when true, also reports the values found (or the missing header).
// Returns nothing.
// Header names are compared case-insensitively, since servers may send
// «X-RateLimit-Reset» as well as «x-ratelimit-reset». The time to wait is
// computed against the server's own «date» header when present and parsable,
// otherwise against the local clock.
function ckratelimit($httpresphead,$verbose=false) {
	$headers=[];
	$lines=preg_split('/\r?\n/',(string)$httpresphead);
	// the first line is the HTTP status line, not a header
	array_shift($lines);
	foreach ($lines as $line) {
		$pair=explode(':',$line,2);
		if (count($pair)!==2) continue;
		$name=strtolower(trim($pair[0]));
		if ($name!=='') $headers[$name]=trim($pair[1]);
	}
	if (!array_key_exists('x-ratelimit-reset',$headers)) {
		if ($verbose) eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!'.N);
		return;
	}
	// e.g. 2022-03-31T04:05:00.058705Z
	$srvrlr=strtotime($headers['x-ratelimit-reset']);
	if ($srvrlr===false) {
		// without a usable reset time there is nothing sensible to wait for
		if ($verbose) eecho(2,'ckratelimit: could not parse the «x-ratelimit-reset» header value!'.N);
		return;
	}
	// e.g. Wed, 30 Mar 2022 21:27:22 GMT
	$srvnow=(array_key_exists('date',$headers) ? strtotime($headers['date']) : false);
	if ($srvnow===false) $srvnow=time();
	// a missing «x-ratelimit-remaining» is treated as «none left», as before
	$remaining=(array_key_exists('x-ratelimit-remaining',$headers) ? (int)$headers['x-ratelimit-remaining'] : 0);
	$stosl=$srvrlr-$srvnow+1;
	if ($verbose) eecho(0,'ckratelimit: x-ratelimit-remaining: '.$remaining.'; $srvnow: '.gmdate('c',$srvnow).'; $srvrlr: '.gmdate('c',$srvrlr).'; current time to sleep: '.$stosl.'.'.N);
	// a reset time already in the past gives a non-positive wait, which sleep() rejects
	if ($remaining<3 && $stosl>0) {
		eecho(2,'reached rate limit, sleeping for '.$stosl.' seconds ...'.N);
		sleep($stosl);
	}
}
?>