From 18ce06871b04f99e7b07b615513e9ae58f18eec3 Mon Sep 17 00:00:00 2001 From: pezcurrel Date: Fri, 9 Dec 2022 22:53:18 +0100 Subject: [PATCH] Added ckratelimit() where useful; made it more flexible with lowercasing every header key; more work on fetching users from users directories --- web/clitools/crawler.php | 134 ++++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 57 deletions(-) diff --git a/web/clitools/crawler.php b/web/clitools/crawler.php index ad3e27b..e27b99e 100755 --- a/web/clitools/crawler.php +++ b/web/clitools/crawler.php @@ -400,6 +400,7 @@ function get_api($host, $path) { global $opts; $buf = @getfc('https://'.$host.$path,$opts['timeout']); if ($buf['cont']!==false) { + ckratelimit($buf['headers']); $data = json_decode($buf['cont'], true); return $data; } else { @@ -646,18 +647,20 @@ while ($i<$cinsts) { $tela=$now-$tini; eecho(1,'working on «'.$host.'»; '.$i.'/'.$cinsts.'; '.$qok.' ok; '.$qgood.' good; '.round(100/$cinsts*$i).'%; elapsed time: '.ght($tela,null,0).'; estimated remaining time: '.ght($tela/$i*($cinsts-$beg)-$tela,null,0).'; mem.: '.ghs(memory_get_usage(true)).'; mem. peak: '.ghs(memory_get_peak_usage(true)).N); if (willtrunc($host,'Instances','URI')) { - eecho(2,'ignoring «'.$host.'» because hostname is too long for the «URI» column of «Instances» table.'.N); + eecho(2,'«'.$host.'»: ignoring it because hostname is too long for the «URI» column of «Instances» table.'.N); } else { - eecho(0,'trying to fetch instance info from API...'.N); + eecho(0,'«'.$host.'»: trying to fetch instance info from API...'.N); $buf=@getfc('https://'.$host.'/api/v1/instance',$opts['timeout']); if ($buf['cont']!==false) { + ckratelimit($buf['headers']); $info=@json_decode($buf['cont'],true); if (is_array($info)) { - eecho(1,'got instance info from API :-)'.N); - eecho(0,'trying to fetch instance info from nodeinfo...'.N); + eecho(1,'«'.$host.'»: got instance info from API :-)'.N); + eecho(0,'«'.$host.'»: trying to fetch instance info from nodeinfo...'.N); $buf=@getfc('https://'.$host.'/nodeinfo/2.0.json',$opts['timeout']); if ($buf['cont']!==false) { - eecho(1,'got instance info from nodeinfo :-)'.N); + //ckratelimit($buf['headers']);// no ckratelimit here because nodeinfo doesn't use it + eecho(1,'«'.$host.'»: got instance info from nodeinfo :-)'.N); $info['x-nodeinfo']=json_decode($buf['cont'],true); // we should keep an eye to new software names here, to decide if they are mastodon derivates... if (isset($info['x-nodeinfo']['software']['name']) && !is_null($info['x-nodeinfo']['software']['name'])) { @@ -672,35 +675,36 @@ while ($i<$cinsts) { } } } else { - eecho(2,'could not fetch instance info from nodeinfo: '.$buf['emsg'].N); + eecho(2,'«'.$host.'»: could not fetch instance info from nodeinfo: '.$buf['emsg'].N); } if (array_key_exists('version',$info)) { if ($info['version']>='2.1.2') { - eecho(0,'trying to fetch instance activity info from API...'.N); + eecho(0,'«'.$host.'»: trying to fetch instance activity info from API...'.N); $buf=@getfc('https://'.$host.'/api/v1/instance/activity',$opts['timeout']); if ($buf['cont']!==false) { - eecho(1,'got instance activity info from API :-)'.N); + ckratelimit($buf['headers']); + eecho(1,'«'.$host.'»: got instance activity info from API :-)'.N); $info['x-activity']=json_decode($buf['cont'],true); } else { - eecho(2,'could not fetch instance activity from API: '.$buf['emsg'].N); + eecho(2,'«'.$host.'»: could not fetch instance activity from API: '.$buf['emsg'].N); } } if ($info['version']>='3.0.0') { - eecho(0,'trying to fetch instance trends info from API...'.N); + eecho(0,'«'.$host.'»: trying to fetch instance trends info from API...'.N); $buf=@getfc('https://'.$host.'/api/v1/trends',$opts['timeout']); if ($buf['cont']!==false) { - eecho(1,'got instance trends info from API :-)'.N); + ckratelimit($buf['headers']); + eecho(1,'«'.$host.'»: got instance trends info from API :-)'.N); $info['x-trends']=json_decode($buf['cont'],true); } else { - eecho(2,'could not fetch instance trends from API: '.$buf['emsg'].N); + eecho(2,'«'.$host.'»: could not fetch instance trends from API: '.$buf['emsg'].N); } if ($opts['fetchusers']) { - $users=[];// array of users in this instance's directory - $res=mysqli_query($link,'SELECT ID, locid, username, OptedOut FROM Users WHERE host=\''.myesc($link,$host).'\'') - or mexit(__LINE__.': '.mysqli_error($link).N,3); $exusers=[];// array of this instance's users already existing in the db + $res=mysqli_query($link,'SELECT ID, locid, username FROM Users WHERE host=\''.myesc($link,$host).'\'') or mexit(__LINE__.': '.mysqli_error($link).N,3); while ($row=mysqli_fetch_assoc($res)) $exusers[$row['locid']]=$row; - eecho(0,'trying to fetch users info from directory API...'.N); + $users=[];// array of users in this instance's directory + eecho(0,'«'.$host.'»: trying to fetch users info from directory API...'.N); $chunk=0; $limit=80; $end=false; @@ -709,7 +713,7 @@ while ($i<$cinsts) { $buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']); if ($buf['cont']!==false) { ckratelimit($buf['headers']); - eecho(1,'got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N); + eecho(1,'«'.$host.'»: got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N); $buf=@json_decode($buf['cont'],true); if (is_array($buf)) { //print_r($buf); @@ -718,67 +722,84 @@ while ($i<$cinsts) { foreach ($buf as $user) { if (make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields'], $user)) { if (!isset($user['noindex'])) $user['noindex']=false; + $user['tags']=[]; + if (!$user['noindex']) { + eecho(0,'«'.$host.'»: trying to fetch tags for user «'.$user['username'].'»...'.N); + $tags=@getfc('https://'.$host.'/api/v1/accounts/'.$user['id'].'/featured_tags',$opts['timeout']); + if ($tags['cont']!==false) { + ckratelimit($tags['headers']); + $tags=@json_decode($tags['cont'],true); + if (is_array($tags) && count($tags)>0) { + eecho(0,'«'.$host.'»: got '.count($tags).' tag(s) for user «'.$user['username'].'» :-)'.N); + foreach($tags as $tag) $user['tags'][]=$tag['name']; + } + } else { + eecho(2,'«'.$host.'»: could not fetch tags for user «'.$user['username'].'» :-( ('.$tags['emsg'].').'.N); + } + } + $user['tags']=implode(';',$user['tags']); + if ($user['tags']=='') $user['tags']=null; if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']); if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']); + $users[$user['id']]=$user; } else { - eecho(2,'user record missed some required keys :-('.N); + eecho(2,'«'.$host.'»: user record missed some required keys :-('.N); //print_r($user); } } } else { - eecho(2,'... but the chunk was not good JSON :-('.N); + eecho(2,'«'.$host.'»: ... but the chunk was not good JSON :-('.N); $end=true; } $chunk++; } else { - eecho(2,'could not fetch users info from directory API: '.$buf['emsg'].N); + eecho(2,'«'.$host.'»: could not fetch users info from directory API: '.$buf['emsg'].N); $end=true; } } foreach ($users as $locid=>$user) { - $query='host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,$user['username']).', display_name='.myv($link,$user['display_name']).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', discoverable='.myv($link,$user['discoverable']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,$user['note']).', url='.myv($link,$user['url']).', avatar='.myv($link,$user['avatar']).', header='.myv($link,$user['header']).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', noindex='.myv($link,$user['noindex']).', OptedOut=NULL'; + $query='SET host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,$user['username']).', display_name='.myv($link,$user['display_name']).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', discoverable='.myv($link,$user['discoverable']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,$user['note']).', url='.myv($link,$user['url']).', avatar='.myv($link,$user['avatar']).', header='.myv($link,$user['header']).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', tags='.myv($link,$user['tags']); + $uid=0; if (!array_key_exists($user['id'],$exusers)) { if (!$user['noindex']) { eecho(0,'«'.$host.'»: inserting new user «'.$user['username'].'»...'.N); - $query='INSERT INTO Users SET '.$query; + $query='INSERT INTO Users '.$query; + if (!$opts['dryrun']) { + mysqli_query($link,$query) or mexit(__LINE__.': '.mysqli_error($link).N,3); + $uid=mysqli_insert_id($link); + } } else { - eecho(0,'«'.$host.'»: NOT inserting new user «'.$user['username'].'» because they set noindex...'.N); + eecho(0,'«'.$host.'»: NOT inserting user «'.$user['username'].'» because they set noindex...'.N); } } else { + $uid=$exusers[$locid]['ID']; if (!$user['noindex']) { - /*$msg='«'.$host.'»: updating existing user «'.$user['username'].'» ('.$exusers[$locid]['ID'].')'; - if (!is_null($exusers[$locid]['OptedOut'])) $msg.=' (who opted back into the directory)'; - $msg.='...'; - eecho(0,$msg.N);*/ - eecho(0,'«'.$host.'»: updating existing user «'.$user['username'].'» ('.$exusers[$locid]['ID'].')...'); - $query='UPDATE Users SET '.$query.' WHERE ID='.$exusers[$locid]['ID']; + eecho(0,'«'.$host.'»: updating existing user «'.$user['username'].'» ('.$uid.')...'.N); + $query='UPDATE Users '.$query.' WHERE ID='.$uid; } else { - eecho(0,'«'.$host.'»: deleting existing user «'.$user['username'].'» ('.$exusers[$locid]['ID'].') because they set noindex...'); - $query='DELETE FROM Users WHERE ID='.$exusers[$locid]['ID']; + eecho(0,'«'.$host.'»: deleting existing user «'.$user['username'].'» ('.$uid.') because they set noindex...'.N); + $query='DELETE FROM Users WHERE ID='.$uid; + } + if (!$opts['dryrun']) { + mysqli_query($link,$query) or mexit(__LINE__.': '.mysqli_error($link).N,3); + mysqli_query($link,'DELETE FROM UsersFields WHERE UserID='.$uid) or mexit(__LINE__.': '.mysqli_error($link).N,3); } } - if (!$opts['dryrun']) mysqli_query($link,$query) - or mexit(__LINE__.': '.mysqli_error($link).N,3); - $uid=mysqli_insert_id($link); - if ($uid==0) $uid=$exusers[$locid]['ID']; - if (!$opts['dryrun']) mysqli_query($link,'DELETE FROM UsersFields WHERE UserID='.$uid) - or mexit(__LINE__.': '.mysqli_error($link).N,3); - if (!$user['noindex'] && is_array($user['fields']) && count($user['fields'])>0) { + if ($uid!=0 && !$user['noindex'] && is_array($user['fields']) && count($user['fields'])>0) { + eecho(0,'«'.$host.'»: saving user fields for user «'.$user['username'].'» ('.$uid.')...'.N); foreach ($user['fields'] as $field) { (is_null($field['verified_at'])) ? $field['verified_at']=0 : $field['verified_at']=1; - if (!$opts['dryrun']) mysqli_query($link,'INSERT INTO UsersFields SET UserID='.$uid.', name='.myv($link,$field['name']).', value='.myv($link,$field['value']).', verified='.$field['verified_at']) - or mexit(__LINE__.': '.mysqli_error($link).N,3); + if (!$opts['dryrun']) mysqli_query($link,'INSERT INTO UsersFields SET UserID='.$uid.', name='.myv($link,$field['name']).', value='.myv($link,$field['value']).', verified='.$field['verified_at']) or mexit(__LINE__.': '.mysqli_error($link).N,3); } } } foreach ($exusers as $locid=>$exuser) { if (!array_key_exists($locid,$users)) { - /*eecho(0,'«'.$host.'»: user «'.$exusers[$locid]['username'].'» opted out of the directory, updating their record ('.$exuser['ID'].')...'.N); - $query='UPDATE Users SET OptedOut='.$now.' WHERE ID='.$exuser['ID'];*/ eecho(0,'«'.$host.'»: user «'.$exusers[$locid]['username'].'» opted out of the directory, deleting their record ('.$exuser['ID'].')...'.N); - $query='DELETE FROM Users WHERE ID='.$exuser['ID']; - if (!$opts['dryrun']) mysqli_query($link,$query) - or mexit(__LINE__.': '.mysqli_error($link).N,3); + if (!$opts['dryrun']) { + mysqli_query($link,'DELETE FROM Users WHERE ID='.$exuser['ID']) or mexit(__LINE__.': '.mysqli_error($link).N,3); + mysqli_query($link,'DELETE FROM UsersFields WHERE UserID='.$exuser['ID']) or mexit(__LINE__.': '.mysqli_error($link).N,3); + } } } } @@ -786,11 +807,11 @@ while ($i<$cinsts) { } } else { $instans=false; - eecho(2,'fetched data were not good JSON.'.N); + eecho(2,'«'.$host.'»: fetched data were not good JSON.'.N); } } else { $instans=false; - eecho(2,'could not fetch instance info from API: '.$buf['emsg'].N); + eecho(2,'«'.$host.'»: could not fetch instance info from API: '.$buf['emsg'].N); } if (!isset($info['uri']) || preg_match('#^\s*$#',$info['uri'])===1) $instans=false; @@ -805,7 +826,7 @@ while ($i<$cinsts) { or mexit(__LINE__.': '.mysqli_error($link).N,3); $nrows=mysqli_num_rows($res); if ($nrows==1) { - eecho(1,'«'.$host.'» didn’t respond, but it is present in the database; updating InstChecks, Instances.LastCheckOk and possibly Instances.New and Instances.Dead.'.N); + eecho(1,'«'.$host.'»: didn’t respond, but it is present in the database; updating InstChecks, Instances.LastCheckOk and possibly Instances.New and Instances.Dead.'.N); $row=mysqli_fetch_assoc($res); if (!$opts['dryrun']) mysqli_query($link,'INSERT INTO InstChecks (InstID, Time, Status) VALUES ('.$row['ID'].', '.$now.', 0)') or mexit(__LINE__.': '.mysqli_error($link).N,3); @@ -831,12 +852,12 @@ while ($i<$cinsts) { notify('Instance «'.$row['URI'].'» is dead!',2); } } else { - eecho(2,'«'.$host.'» exists in the database but there’s no data about it in InstChecks! I’ll remedy.'.N); + eecho(2,'«'.$host.'»: exists in the database but there’s no data about it in InstChecks! I’ll remedy.'.N); if (!$opts['dryrun']) mysqli_query($link,'INSERT INTO InstChecks SET InstID='.$row['ID'].', Time='.$now.', Status=0') or mexit(__LINE__.': '.mysqli_error($link).N,3); } } elseif ($nrows==0) { - eecho(1,'«'.$host.'» doesn’t respond and is not in the database, adding it.'.N); + eecho(1,'«'.$host.'»: doesn’t respond and is not in the database, adding it.'.N); // "New=0" and "FirstSeen=NULL" because it's not new and not seen until it responds for the first time if (!$opts['dryrun']) { mysqli_query($link,'INSERT INTO Instances SET FirstSeen=NULL, New=0, Good=0, Chosen=0, Visible=0, Noxious=0, URI=\''.myesc($link,$host).'\', LastCheckOk=0') or mexit(__LINE__.': '.mysqli_error($link).N,3); @@ -950,10 +971,10 @@ while ($i<$cinsts) { } if (count($whynot)==0) { $instrow['Good']=1; - eecho(1,'this is a suitable instance! :-)'.N); + eecho(1,'«'.$host.'»: this is a suitable instance! :-)'.N); $qgood++; } else { - eecho(1,'This is not a suitable instance: '.implode('; ',$whynot).' :-('.N); + eecho(1,'«'.$host.'»: this is not a suitable instance: '.implode('; ',$whynot).' :-('.N); } $res=mysqli_query($link,'SELECT * FROM Instances WHERE URI=\''.myesc($link,$instrow['URI']).'\'') @@ -961,7 +982,7 @@ while ($i<$cinsts) { $nrows=mysqli_num_rows($res); if ($nrows==1) { - eecho(1,'«'.$instrow['URI'].'» is already present in the database, updating it...'.N); + eecho(1,'«'.$instrow['URI'].'»: is already present in the database, updating it...'.N); $oldinstrow=mysqli_fetch_assoc($res); $instid=$oldinstrow['ID']; $instrow['ID']=$oldinstrow['ID']; @@ -1009,7 +1030,7 @@ while ($i<$cinsts) { $query.=$field.'=NULL, '; } $query=substr($query,0,-2).' WHERE Instances.ID='.$instrow['ID']; - eecho(1,'Update query: «'.$query.'».'.N); + eecho(1,'«'.$host.'»: update query: «'.$query.'».'.N); if (!$opts['dryrun']) mysqli_query($link,$query) or mexit(__LINE__.': '.mysqli_error($link).N,3); @@ -1064,7 +1085,7 @@ while ($i<$cinsts) { } $values=substr($values,0,-2); $query='INSERT INTO Instances ('.implode(', ',$fields).') VALUES ('.$values.')'; - eecho(1,'Insert query: «'.$query.'»'.N); + eecho(1,'«'.$host.'»: insert query: «'.$query.'»'.N); if (!$opts['dryrun']) { mysqli_query($link,$query) or mexit(__LINE__.': '.mysqli_error($link).N,3); $instid=mysqli_insert_id($link); @@ -1196,9 +1217,8 @@ function ckratelimit($httpresphead,$verbose=false) { array_shift($headers); foreach ($headers as $header) if (preg_match('/^([^:]+):(.*)$/Uu',$header,$matches)===1) - $buff[$matches[1]]=trim($matches[2]); + $buff[strtolower($matches[1])]=trim($matches[2]); $headers=$buff; - //print_r($headers); if (array_key_exists('x-ratelimit-reset',$headers)) { //Wed, 30 Mar 2022 21:27:22 GMT $srvnow=strtotime($headers['date']);