Adding “fetchusers” option; changed “timeout” default from 5 to 10 seconds
This commit is contained in:
parent
e8e3fa5872
commit
c3a90ba6b8
1 changed files with 124 additions and 3 deletions
|
@ -63,8 +63,8 @@ if (function_exists('pcntl_signal')) {
|
|||
pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
|
||||
}
|
||||
|
||||
$opts=array(
|
||||
'timeout'=>5,
|
||||
$opts=[
|
||||
'timeout'=>10,
|
||||
'deadline'=>60*24*60*60,// if an instance has not been responding for more than this value of seconds (currently 60 days), declare it dead
|
||||
'oldline'=>30*24*60*60,// if an instance has been new for a period longer than this amount (currently 30 days), it's no longer new
|
||||
'ldtoots'=>40,// number of toots to check with the automatic language detection function
|
||||
|
@ -75,8 +75,9 @@ $opts=array(
|
|||
'peersfp'=>null,
|
||||
'dontrestore'=>false,
|
||||
'ignorelock'=>false,
|
||||
'fetchusers'=>false,
|
||||
'moreclauses'=>''
|
||||
);
|
||||
];
|
||||
|
||||
$help='crawler.php
|
||||
DESCRIPTION
|
||||
|
@ -92,6 +93,10 @@ $help='crawler.php
|
|||
which are already present in the database.
|
||||
Note that this option is ignored if the script will recover a previous
|
||||
unfinished session.
|
||||
-f, --fetchusers
|
||||
*Currently experimental*: if this option is set, the script will try and
|
||||
fetch users’ profiles infos from each considered instance’s user directory
|
||||
and store them in the database.
|
||||
-t, --timeout <seconds>
|
||||
Sets the timeout in seconds for every connection attempt.
|
||||
DEFAULT: «'.$opts['timeout'].'»
|
||||
|
@ -133,6 +138,10 @@ for ($i=1; $i<$argc; $i++) {
|
|||
$i++;
|
||||
$opts['peersfp']=$argv[$i];
|
||||
break;
|
||||
case '-f':
|
||||
case '--fetchusers':
|
||||
$opts['fetchusers']=true;
|
||||
break;
|
||||
case '-t':
|
||||
case '--timeout':
|
||||
if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
|
||||
|
@ -682,6 +691,65 @@ while ($i<$cinsts) {
|
|||
} else {
|
||||
eecho(2,'could not fetch instance trends from API: '.$buf['emsg'].N);
|
||||
}
|
||||
// --fetchusers (experimental): page through the instance's directory API and
// insert/update one row of table Users for every local account it returns.
if ($opts['fetchusers']) {
	// Users already stored for this host, as a map locid => ID.
	$res=mysqli_query($link,'SELECT ID, locid FROM Users WHERE host=\''.myesc($link,$host).'\'')
		or mexit(__LINE__.': '.mysqli_error($link).N,3);
	$uids=[];
	while ($row=mysqli_fetch_assoc($res)) $uids[$row['locid']]=$row['ID'];
	eecho(0,'trying to fetch users info from directory API...'.N);
	$chunk=0;
	$limit=80;
	$end=false;
	while (!$end) {
		$offset=$chunk*$limit;
		$buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']);
		if ($buf['cont']!==false) {
			ckratelimit($buf['headers']);
			eecho(1,'got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N);
			$buf=@json_decode($buf['cont'],true);
			if (is_array($buf)) {
				// a chunk shorter than the requested limit is the last one
				if (count($buf)<$limit) $end=true;
				foreach ($buf as $user) {
					// The decoded JSON may be something else than a list of records
					// (e.g. an object carrying an error message): only arrays are
					// checked for the required keys, anything else is reported as a
					// bad record instead of making array_key_exists() fail.
					if (is_array($user) && make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields'], $user)) {
						// «noindex» is optional: a missing key is stored as false
						if (!isset($user['noindex'])) $user['noindex']=false;
						if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']);
						if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']);
						// column assignments shared by the INSERT and the UPDATE below
						$query='host='.myv($link,$host)
							.', locid='.myv($link,$user['id'])
							.', username='.myv($link,$user['username'])
							.', display_name='.myv($link,$user['display_name'])
							.', locked='.myv($link,$user['locked'])
							.', bot='.myv($link,$user['bot'])
							.', discoverable='.myv($link,$user['discoverable'])
							.', created_at='.myv($link,$user['created_at'])
							.', note='.myv($link,$user['note'])
							.', url='.myv($link,$user['url'])
							.', avatar='.myv($link,$user['avatar'])
							.', header='.myv($link,$user['header'])
							.', statuses_count='.myv($link,$user['statuses_count'])
							.', last_status_at='.myv($link,$user['last_status_at'])
							.', noindex='.myv($link,$user['noindex']);
						$do=true;
						$ures=mysqli_query($link,'SELECT * FROM Users WHERE host=\''.myesc($link,$host).'\' AND locid=\''.myesc($link,$user['id']).'\'')
							or mexit(__LINE__.': '.mysqli_error($link).N,3);
						$nr=mysqli_num_rows($ures);
						if ($nr==0) {
							// unknown user: create the record
							$query='INSERT INTO Users SET '.$query;
						} elseif ($nr==1) {
							// known user: refresh the existing record
							$urow=mysqli_fetch_assoc($ures);
							$query='UPDATE Users SET '.$query.' WHERE ID='.$urow['ID'];
						} else {
							// duplicated (host, locid) pair: touch nothing, just report it
							$do=false;
							notify('Table Users contains more than one record with locid='.$user['id'].' and host='.$host,3);
						}
						// with «dryrun» set nothing is written to the database
						if ($do && !$opts['dryrun']) mysqli_query($link,$query)
							or mexit(__LINE__.': '.mysqli_error($link).N,3);
					} else {
						eecho(2,'user record missed some required keys :-('.N);
					}
				}
			} else {
				eecho(2,'... but the chunk was not good JSON :-('.N);
				$end=true;
			}
			$chunk++;
		} else {
			eecho(2,'could not fetch users info from directory API: '.$buf['emsg'].N);
			$end=true;
		}
	}
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -1062,4 +1130,57 @@ eecho(1,'Done :-)'.N);
|
|||
|
||||
exit(0);
|
||||
|
||||
// "multi array_key_exists"
|
||||
/**
 * Tells whether every key listed in $keys exists in $arr.
 *
 * A key whose value is null still counts as present, since the check is done
 * with array_key_exists().
 *
 * @param array $keys the keys that must all be present
 * @param mixed $arr  the array to inspect (taken by reference, never modified);
 *                    a value that is not an array yields false instead of
 *                    letting array_key_exists() raise a TypeError
 * @return bool true if all the keys exist in $arr, false otherwise
 */
function make($keys,&$arr) {
	if (!is_array($arr))
		return(false);
	foreach ($keys as $key)
		if (!array_key_exists($key,$arr))
			return(false);
	return(true);
}
|
||||
|
||||
/**
 * Renders a PHP value as a literal ready to be placed in an SQL statement.
 *
 *  - null                                → NULL
 *  - true / false                        → 1 / 0
 *  - a value that is empty once trimmed  → NULL
 *  - anything else                       → the value escaped with
 *                                          mysqli_real_escape_string() and
 *                                          wrapped in single quotes
 *
 * @param mysqli $link the connection used for escaping
 * @param mixed  $var  the value to render
 * @return string the SQL literal
 */
function myv(&$link,$var) {
	if ($var===null)
		return('NULL');
	if (is_bool($var))
		return($var ? '1' : '0');
	if (trim($var)=='')
		return('NULL');
	$escaped=mysqli_real_escape_string($link,$var);
	return('\''.$escaped.'\'');
}
|
||||
|
||||
/**
 * Converts a "YYYY-MM-DD" date into the Unix timestamp of midnight of that
 * day, as computed by mktime() (i.e. in the script's current time zone).
 *
 * Only the leading "year-month-day" part of $date is read, so a full
 * ISO 8601 datetime ("2022-03-30T21:27:22.000Z") gives the same result as the
 * bare date without feeding non-numeric text to mktime().
 *
 * @param mixed $date the date to convert
 * @return int|null the timestamp, or null when $date is not a string starting
 *                  with a "digits-digits-digits" date
 */
function datetomy($date) {
	if (!is_string($date) || preg_match('/^(\d+)-(\d+)-(\d+)/',$date,$parts)!==1)
		return(null);
	return(mktime(0,0,0,(int)$parts[2],(int)$parts[3],(int)$parts[1]));
}
|
||||
|
||||
/**
 * Honours the rate limit a server advertises in its HTTP response headers.
 *
 * When the response carries an «x-ratelimit-reset» header and fewer than 3
 * requests are left according to «x-ratelimit-remaining», the function sleeps
 * until one second past the reset time. The wait is measured against the
 * server's own clock («date» header), falling back to the local clock when
 * that header is missing or unparsable.
 *
 * @param string $httpresphead the raw response head: a status line followed by
 *                             header lines, separated by "\r\n"
 * @param bool   $verbose      if true, the parsed values (or the reason why
 *                             nothing was done) are reported through eecho()
 * @return void
 */
function ckratelimit($httpresphead,$verbose=false) {
	$lines=explode("\r\n",$httpresphead);
	// the first line is the HTTP status line, not a header
	array_shift($lines);
	// Header names are case-insensitive: they are stored lowercased, so that
	// «X-RateLimit-Reset» and «x-ratelimit-reset» are found by the same lookup.
	$headers=[];
	foreach ($lines as $line)
		if (preg_match('/^([^:]+):(.*)$/Uu',$line,$matches)===1)
			$headers[strtolower(trim($matches[1]))]=trim($matches[2]);
	if (!array_key_exists('x-ratelimit-reset',$headers)) {
		if ($verbose) eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!'.N);
		return;
	}
	// server time, e.g. «Wed, 30 Mar 2022 21:27:22 GMT»
	$srvnow=(array_key_exists('date',$headers) ? strtotime($headers['date']) : false);
	if ($srvnow===false) $srvnow=time();
	// reset time, e.g. «2022-03-31T04:05:00.058705Z»
	$srvrlr=strtotime($headers['x-ratelimit-reset']);
	if ($srvrlr===false) {
		if ($verbose) eecho(2,'ckratelimit: could not parse the «x-ratelimit-reset» header!'.N);
		return;
	}
	// a missing «x-ratelimit-remaining» header is treated as "no requests left"
	$remaining=(array_key_exists('x-ratelimit-remaining',$headers) ? $headers['x-ratelimit-remaining'] : '0');
	$stosl=$srvrlr-$srvnow+1;
	if ($verbose) eecho(0,'ckratelimit: x-ratelimit-remaining: '.$remaining.'; $srvnow: '.gmdate('c',$srvnow).'; $srvrlr: '.gmdate('c',$srvrlr).'; current time to sleep: '.$stosl.'.'.N);
	// Nothing to wait for when the reset time is already past; this also keeps
	// a negative value away from sleep().
	if ((int)$remaining<3 && $stosl>0) {
		eecho(2,'reached rate limit, sleeping for '.$stosl.' seconds ...'.N);
		sleep($stosl);
	}
}
|
||||
|
||||
?>
|
||||
|
|
Loading…
Reference in a new issue