diff --git a/web/clitools/crawler.php b/web/clitools/crawler.php
index ab1a25b..53904cb 100755
--- a/web/clitools/crawler.php
+++ b/web/clitools/crawler.php
@@ -63,8 +63,8 @@ if (function_exists('pcntl_signal')) {
 	pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
 }
 
-$opts=array(
-	'timeout'=>5,
+$opts=[
+	'timeout'=>10,
 	'deadline'=>60*24*60*60,// if an instance has not been responding for more than this value of seconds (currently 60 days), declare it dead
 	'oldline'=>30*24*60*60,// if an instance has been new for a period longer than this amount (currently 30 days), it's no longer new
 	'ldtoots'=>40,// number of toots to check with the automatic language detection function
@@ -75,8 +75,9 @@ $opts=array(
 	'peersfp'=>null,
 	'dontrestore'=>false,
 	'ignorelock'=>false,
+	'fetchusers'=>false,
 	'moreclauses'=>''
-);
+];
 
 $help='crawler.php
 	DESCRIPTION
@@ -92,6 +93,10 @@ $help='crawler.php
 		which are already present in the database.
 		Note that this option is ignored if the script will recover a previous
 		unfinished session.
+	-f, --fetchusers
+		*Currently experimental*: if this option is set, the script will try and
+		fetch users’ profiles infos from each considered instance’s user directory
+		and store them in the database.
 	-t, --timeout
 		Sets the timeout in seconds for every connection attempt.
 		DEFAULT: «'.$opts['timeout'].'»
@@ -133,6 +138,10 @@ for ($i=1; $i<$argc; $i++) {
 			$i++;
 			$opts['peersfp']=$argv[$i];
 			break;
+		case '-f':
+		case '--fetchusers':
+			$opts['fetchusers']=true;
+			break;
 		case '-t':
 		case '--timeout':
 			if ($i+1>=$argc || preg_match('/^[0-9]+$/',$argv[$i+1])!==1)
@@ -682,6 +691,59 @@ while ($i<$cinsts) {
 				} else {
 					eecho(2,'could not fetch instance trends from API: '.$buf['emsg'].N);
 				}
+				if ($opts['fetchusers']) {
+					// experimental: page through the instance's directory API and insert/update each local profile in table Users
+					eecho(0,'trying to fetch users info from directory API...'.N);
+					$chunk=0;
+					$limit=80;
+					$end=false;
+					while (!$end) {
+						$offset=$chunk*$limit;
+						$buf=@getfc('https://'.$host.'/api/v1/directory?local=1&order=new&limit='.$limit.'&offset='.$offset,$opts['timeout']);
+						if ($buf['cont']!==false) {
+							ckratelimit($buf['headers']);
+							eecho(1,'got '.($chunk+1).' chunk(s) of users info from directory API :-)'.N);
+							$buf=@json_decode($buf['cont'],true);
+							if (is_array($buf)) {
+								// a page shorter than $limit is the last one
+								if (count($buf)<$limit) $end=true;
+								foreach ($buf as $user) {
+									// the payload is untrusted remote JSON: skip entries that are not arrays or whose id cannot be used as a string
+									if (make(['id', 'username', 'display_name', 'locked', 'bot', 'discoverable', 'created_at', 'note', 'url', 'avatar', 'header', 'statuses_count', 'last_status_at', 'fields'], $user) && is_scalar($user['id'])) {
+										if (!isset($user['noindex'])) $user['noindex']=false;
+										if (!is_null($user['created_at'])) $user['created_at']=pgdatetomy($user['created_at']);
+										if (!is_null($user['last_status_at'])) $user['last_status_at']=datetomy($user['last_status_at']);
+										$query='host='.myv($link,$host).', locid='.myv($link,$user['id']).', username='.myv($link,$user['username']).', display_name='.myv($link,$user['display_name']).', locked='.myv($link,$user['locked']).', bot='.myv($link,$user['bot']).', discoverable='.myv($link,$user['discoverable']).', created_at='.myv($link,$user['created_at']).', note='.myv($link,$user['note']).', url='.myv($link,$user['url']).', avatar='.myv($link,$user['avatar']).', header='.myv($link,$user['header']).', statuses_count='.myv($link,$user['statuses_count']).', last_status_at='.myv($link,$user['last_status_at']).', noindex='.myv($link,$user['noindex']);
+										$do=true;
+										$ures=mysqli_query($link,'SELECT * FROM Users WHERE host=\''.myesc($link,$host).'\' AND locid=\''.myesc($link,$user['id']).'\'')
+											or mexit(__LINE__.': '.mysqli_error($link).N,3);
+										$nr=mysqli_num_rows($ures);
+										if ($nr==0) {
+											$query='INSERT INTO Users SET '.$query;
+										} elseif ($nr==1) {
+											$urow=mysqli_fetch_assoc($ures);
+											$query='UPDATE Users SET '.$query.' WHERE ID='.$urow['ID'];
+										} else {
+											$do=false;
+											notify('Table Users contains more than one record with locid='.$user['id'].' and host='.$host,3);
+										}
+										if ($do && !$opts['dryrun']) mysqli_query($link,$query)
+											or mexit(__LINE__.': '.mysqli_error($link).N,3);
+									} else {
+										eecho(2,'user record missed some required keys :-('.N);
+									}
+								}
+							} else {
+								eecho(2,'... but the chunk was not good JSON :-('.N);
+								$end=true;
+							}
+							$chunk++;
+						} else {
+							eecho(2,'could not fetch users info from directory API: '.$buf['emsg'].N);
+							$end=true;
+						}
+					}
+				}
 			}
 		}
 	} else {
@@ -1062,4 +1124,76 @@ eecho(1,'Done :-)'.N);
 
 exit(0);
 
+// "Multi array_key_exists": returns true only if $arr is an array holding every
+// key listed in $keys (a non-array $arr, e.g. a malformed API entry, yields false).
+function make($keys,&$arr) {
+	if (!is_array($arr))
+		return(false);
+	foreach ($keys as $key)
+		if (!array_key_exists($key,$arr))
+			return(false);
+	return(true);
+}
+
+// Renders $var as an SQL value: NULL for null, blank strings and non-scalar
+// values, 1/0 for booleans, an escaped and single-quoted string otherwise.
+function myv(&$link,$var) {
+	if (is_null($var) || !is_scalar($var)) {
+		// arrays and objects cannot be escaped as strings
+		return('NULL');
+	} elseif (is_bool($var)) {
+		return($var ? '1' : '0');
+	}
+	$var=(string)$var;
+	if (trim($var)==='')
+		return('NULL');
+	return('\''.mysqli_real_escape_string($link,$var).'\'');
+}
+
+// Converts a «YYYY-MM-DD» date into the Unix timestamp mktime() gives for
+// midnight of that day; anything following the date (e.g. a time part) is ignored.
+// Returns null when $date does not start with a valid calendar date.
+function datetomy($date) {
+	if (!is_string($date) || preg_match('/^(\d{4})-(\d{2})-(\d{2})/',$date,$m)!==1)
+		return(null);
+	if (!checkdate((int)$m[2],(int)$m[3],(int)$m[1]))
+		return(null);
+	return(mktime(0,0,0,(int)$m[2],(int)$m[3],(int)$m[1]));
+}
+
+// Reads the rate limit headers out of a raw HTTP response header block and, when
+// fewer than 3 requests are left, sleeps until the announced reset time.
+// $httpresphead: header block with CRLF separated lines; $verbose: if true, also
+// logs the values found (or the lack of an «x-ratelimit-reset» header).
+function ckratelimit($httpresphead,$verbose=false) {
+	$lines=explode("\r\n",(string)$httpresphead);
+	array_shift($lines);// drop the status line
+	$headers=[];
+	foreach ($lines as $line) {
+		// header names are case-insensitive («Date», «date»...), so store them lowercased
+		if (preg_match('/^([^:]+):(.*)$/Uu',$line,$matches)===1)
+			$headers[strtolower(trim($matches[1]))]=trim($matches[2]);
+	}
+	if (!array_key_exists('x-ratelimit-reset',$headers)) {
+		if ($verbose) eecho(2,'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!'.N);
+		return;
+	}
+	// e.g. «Wed, 30 Mar 2022 21:27:22 GMT»; use the local clock if the server sent no usable date
+	$srvnow=(isset($headers['date']) ? strtotime($headers['date']) : false);
+	if ($srvnow===false) $srvnow=time();
+	// e.g. «2022-03-31T04:05:00.058705Z»
+	$srvrlr=strtotime($headers['x-ratelimit-reset']);
+	if ($srvrlr===false || !isset($headers['x-ratelimit-remaining'])) {
+		if ($verbose) eecho(2,'ckratelimit: could not parse the rate limit headers!'.N);
+		return;
+	}
+	$stosl=$srvrlr-$srvnow+1;
+	if ($verbose) eecho(0,'ckratelimit: x-ratelimit-remaining: '.$headers['x-ratelimit-remaining'].'; $srvnow: '.gmdate('c',$srvnow).'; $srvrlr: '.gmdate('c',$srvrlr).'; current time to sleep: '.$stosl.'.'.N);
+	// sleep() rejects negative values, so wait only if the reset time is still ahead
+	if ((int)$headers['x-ratelimit-remaining']<3 && $stosl>0) {
+		eecho(2,'reached rate limit, sleeping for '.$stosl.' seconds ...'.N);
+		sleep($stosl);
+	}
+}
+
 ?>
diff --git a/web/clitools/maintain/crawl.bash b/web/clitools/maintain/crawl.bash
index b451604..1b9141e 100755
--- a/web/clitools/maintain/crawl.bash
+++ b/web/clitools/maintain/crawl.bash
@@ -9,7 +9,7 @@ elif [ ! -d "$1" ]; then
 fi
 
 cd "$1"
-MAILCFG="../../mail.conf"
+MAILCFG="../conf/mail.conf"
 LOGF="crawl.log"
 MAILF="mail"
 
@@ -26,7 +26,7 @@ function domail {
 function logcmd {
 	init="$(date)"
 	echo "$init: eseguo «$1»" >> $LOGF
-	$1 > $2
+	$1 &> $2
 	ec=$?
 	endt="$(date)"
 	echo -n "$endt: fine esecuzione di «$1»: " >> $LOGF
@@ -50,9 +50,9 @@ logcmd "php mustool.php updstats" "mustool.updstats.log"
 
 if [ "$2" == "dopeers" ]; then
 	logcmd "php peerscrawl.php -e peerscrawl.exclude -E" "peerscrawl.log"
-	logcmd "php crawler.php -p peers -t 8" "crawler.log"
+	logcmd "php crawler.php -p peers -t 10" "crawler.log"
 else
-	logcmd "php crawler.php -t 8" "crawler.log"
+	logcmd "php crawler.php -t 10" "crawler.log"
 fi
 
 logcmd "php mustool.php shuffle clean optimize" "mustool.shuffle-clean-optimize.log"
diff --git a/web/clitools/peerscrawl.php b/web/clitools/peerscrawl.php
index 6a54014..7d5155c 100755
--- a/web/clitools/peerscrawl.php
+++ b/web/clitools/peerscrawl.php
@@ -254,6 +254,7 @@ if ($opts['excludedead']) {
 	while ($row=mysqli_fetch_assoc($res))
 		$deadinsts[]=$row['URI'];
 	unset($res);
+	gecho('Loaded list of dead instances ('.count($deadinsts).').'.N,true,false);
 }
 
 /*$contextopts=array(
@@ -403,7 +404,7 @@ function crawl($list,$id) {
 				} elseif (!validhostname($peer)) {
 					gecho(' ERROR: I won’t add «'.$peer.'» to next round list because it’s not a valid hostname.'.N,true,true);
 				} elseif (ckexarr($peer)) {
-					gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,true);
+					gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because its name matches with an exclusion regex.'.N,true,false);
 				} elseif (in_array($peer,$ainsts)) {
 					if ($opts['verbose'])
 						gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $ainsts.'.N,true,false);
@@ -420,8 +421,7 @@ function crawl($list,$id) {
 						gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s already in $insts.'.N,true,false);
 					}*/
 				} elseif ($opts['excludedead'] && in_array($peer,$deadinsts)) {
-					if ($opts['verbose'])
-						gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s DEAD.'.N,true,false);
+					gecho(' NOTICE: I won’t add «'.$peer.'» to next round list because it’s DEAD.'.N,true,false);
 				} else {
 					//EVVIVA!
 					gecho(' ADDING PEER «'.$peer.'» to next round list.'.N,true,false);
diff --git a/web/site/mustard/include/getfc.php b/web/site/mustard/include/getfc.php
index b11b865..17c1b76 100644
--- a/web/site/mustard/include/getfc.php
+++ b/web/site/mustard/include/getfc.php
@@ -11,7 +11,18 @@ function getfc($url,$timeout,$headers=[]) {
 	curl_setopt($curli,CURLOPT_TIMEOUT,$timeout);
 	curl_setopt($curli,CURLOPT_CONNECTTIMEOUT,$timeout);
 	curl_setopt($curli,CURLOPT_HTTPHEADER,$headers);
+	// have curl_exec() return the response headers too, in front of the body
+	curl_setopt($curli,CURLOPT_HEADER,true);
 	$cont=curl_exec($curli);
+	if ($cont===false) {
+		// the transfer failed: leave «cont» false so that callers can tell, and report no headers
+		$headers='';
+	} else {
+		$headers_sz=curl_getinfo($curli,CURLINFO_HEADER_SIZE);
+		$headers=(string)substr($cont,0,$headers_sz);
+		// before PHP 8 substr() returns false rather than '' for an empty body
+		$cont=(string)substr($cont,$headers_sz);
+	}
 	$emsg=false;
 	$cerrno=curl_errno($curli);
 	if ($cerrno) {
@@ -24,7 +35,7 @@ function getfc($url,$timeout,$headers=[]) {
 		$emsg='unknown';
 	}
 	curl_close($curli);
-	return(array('cont'=>$cont,'emsg'=>$emsg));
+	return(['cont'=>$cont,'headers'=>$headers,'emsg'=>$emsg]);
 }
 
 ?>