#!/usr/bin/php
<?php
/*
	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.
	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.
	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2022-11-30 07:19:14 +01:00
// Line terminator appended to every console message and to every line of the
// job files. It must be a bare newline: the instances job file is written as
// "$host . N" and read back one host per line, so padding would end up in the
// hostnames.
const N = "\n";
2020-10-13 08:21:26 +02:00
2022-12-16 21:59:26 +01:00
require ( __DIR__ . '/../site/mustard/include/gurl.php' );
2020-10-13 08:21:26 +02:00
2022-12-01 05:41:54 +01:00
require ( __DIR__ . '/lib/vendor/autoload.php' );
2020-10-13 08:21:26 +02:00
use LanguageDetection\Language ;
2022-12-01 05:41:54 +01:00
// True when running on Windows (PHP_OS starts with "WIN", case-insensitively).
$iswin = (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN');
2020-10-13 08:21:26 +02:00
2022-12-01 05:41:54 +01:00
/**
 * Prints a message prefixed with a timestamp (sub-second precision) and a level label.
 *
 * Levels 0 (Debug) and 1 (Info) go to standard output, levels 2 (Warning)
 * and 3 (Error) go to STDERR. The message is written as-is: callers append
 * their own line terminator.
 *
 * @param int $lev Index into the labels list: 0=Debug, 1=Info, 2=Warning, 3=Error
 * @param string $msg Text to print
 */
function eecho($lev, $msg) {
    // microtime(false) returns "0.fraction seconds"
    list($fraction, $seconds) = explode(' ', microtime(false));
    $stamp = date('Y-m-d H:i:s', $seconds) . '.' . substr($fraction, 2);
    $labels = ['Debug', 'Info', 'Warning', 'Error'];
    $line = $stamp . ' ' . $labels[$lev] . ': ' . $msg;
    if ($lev >= 2) {
        fwrite(STDERR, $line);
        return;
    }
    echo($line);
}
/**
 * Releases the script's global resources, prints a final message and exits.
 *
 * Closes the MySQL connection and the JSON output file if they are open,
 * removes the lock file if it exists, then prints $msg as an error when
 * $code is non-zero or as an info message otherwise.
 *
 * @param string $msg Final message (callers append their own line terminator)
 * @param int $code Process exit status
 */
function mexit($msg, $code) {
    global $link, $jsonf, $lockfp;
    // The script opens its handles as "$x = @open(...) or mexit(...)": on failure
    // the variable already holds false when we get here, and isset(false) is true.
    // Check the actual type so we never call mysqli_close()/fclose() on false.
    if (isset($link) && $link instanceof mysqli) mysqli_close($link);
    if (isset($jsonf) && is_resource($jsonf)) fclose($jsonf);
    if (isset($lockfp) && is_file($lockfp)) unlink($lockfp);
    if ($code != 0)
        eecho(3, $msg);
    else
        eecho(1, $msg);
    exit($code);
}
2020-10-13 08:21:26 +02:00
// Tick-based signal delivery: with ticks enabled, handlers registered through
// pcntl_signal() are invoked between statements.
declare(ticks = 1);
if (function_exists('pcntl_signal')) {
    /**
     * Shuts the script down cleanly (through mexit) when a signal is received.
     *
     * @param int $signal Number of the received signal
     */
    function signalHandler($signal) {
        // start on a fresh line in case the signal arrived in the middle of some output
        echo(N);
        mexit('received signal «' . $signal . '», shutting down.' . N, 0);
    }
    pcntl_signal(SIGTERM, 'signalHandler'); // Termination ('kill' was called)
    pcntl_signal(SIGHUP, 'signalHandler'); // Terminal log-out
    pcntl_signal(SIGINT, 'signalHandler'); // Interrupted (Ctrl-C is pressed)
}
2022-12-08 00:03:10 +01:00
// Default settings; most of them can be overridden from the command line.
$opts = [
    'timeout' => 10,
    'deadline' => 60 * 24 * 60 * 60, // if an instance has not been responding for more than this value of seconds (currently 60 days), declare it dead
    'oldline' => 30 * 24 * 60 * 60, // if an instance has been new for a period longer than this amount (currently 30 days), it's no longer new
    'ldtoots' => 40, // number of toots to check with the automatic language detection function
    'setnew' => true,
    'dryrun' => false,
    'jsonfp' => __DIR__ . '/instances.json',
    'jsonwrite' => false,
    'peersfp' => null,
    'dontrestore' => false,
    'ignorelock' => false,
    'fetchusers' => false,
    'moreclauses' => ''
];
2020-10-13 08:21:26 +02:00
// Help text shown by «-h»/«--help»; the default timeout is taken from $opts.
$help = 'crawler.php

DESCRIPTION
  This script updates mastostart’s database with the data it manages to
  retrieve from instances already present in the database plus (optionally)
  those listed in a specifiable file (typically the output file from a
  peerscrawl.php run).

SYNOPSIS
  crawler.php [options]

OPTIONS
  -p, --peersfp <file>
    Sets a file containing a list of instances to consider in addition to those
    which are already present in the database.
    Note that this option is ignored if the script will recover a previous
    unfinished session.
  -f, --fetchusers
    *Currently experimental*: if this option is set, the script will try and
    fetch users’ profiles infos from each considered instance’s user directory
    and store them in the database.
  -t, --timeout <seconds>
    Sets the timeout in seconds for every connection attempt.
    DEFAULT: «' . $opts['timeout'] . '»
  -N, --dontsetnew
    If this option is set, the script won’t mark new instances as new. This can
    be useful for a first run.
  -I, --ignorelock
    Normally, if its lockfile exists, the script will exit with an error.
    If this option is set, the lockfile existence will be ignored.
    Warning: check that the script is actually not running yet before using
    this option.
  -R, --dontrestore
    If this option is set and «instances.job» and «currinst.job» files from
    a previous unfinished session are present, the script will ignore them
    and start a new session.
  -d, --dryrun
    If this option is set, the script won’t write anything in the database.
  -j, --jsonwrite
    If this option is set, the script will write an «instances.json» file
    containing all the data it could retrieve from every considered instance.
  -m, --moreclauses <more SQL clauses>
    If this option is set, whatever one writes as argument to the option will
    be added to the main query for instances’ records, which is
    «SELECT URI FROM Instances WHERE Dead=0», so one can limit the crawl more.
  -h, --help
    If this option is set, the script will show this help text and exit.

This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
This is free software, and you are welcome to redistribute it under
certain conditions; see <http://www.gnu.org/licenses/> for details.' . N;
// Command-line parsing: every argument starting with «-» must be a known
// option; options taking a value consume the following argument.
// Arguments not starting with «-» are ignored.
for ($i = 1; $i < $argc; $i++) {
    if (substr($argv[$i], 0, 1) == '-') {
        switch ($argv[$i]) {
            case '-p':
            case '--peersfp':
                // is_file() is already false for a path that does not exist
                if ($i + 1 >= $argc || !is_file($argv[$i + 1]) || !is_readable($argv[$i + 1]))
                    mexit('option «' . $argv[$i] . '» requires an existing and readable file as an argument (use «-h» to read help).' . N, 1);
                $i++;
                $opts['peersfp'] = $argv[$i];
                break;
            case '-f':
            case '--fetchusers':
                $opts['fetchusers'] = true;
                break;
            case '-t':
            case '--timeout':
                if ($i + 1 >= $argc || preg_match('/^[0-9]+$/', $argv[$i + 1]) !== 1)
                    mexit('option «' . $argv[$i] . '» requires a numeric argument (use «-h» to read help).' . N, 1);
                $i++;
                $opts['timeout'] = $argv[$i] + 0;
                break;
            case '-N':
            case '--dontsetnew':
                $opts['setnew'] = false;
                break;
            case '-R':
            case '--dontrestore':
                $opts['dontrestore'] = true;
                break;
            case '-I':
            case '--ignorelock':
                $opts['ignorelock'] = true;
                break;
            case '-d':
            case '--dryrun':
                $opts['dryrun'] = true;
                break;
            case '-j':
            case '--jsonwrite':
                $opts['jsonwrite'] = true;
                break;
            case '-m':
            case '--moreclauses':
                if ($i + 1 >= $argc)
                    mexit('option «' . $argv[$i] . '» requires some SQL clause as argument (use «-h» to read help).' . N, 1);
                $i++;
                $opts['moreclauses'] = $argv[$i];
                break;
            case '-h':
            case '--help':
                echo($help);
                exit(0);
            default:
                mexit('option «' . $argv[$i] . '» is unknown (use «-h» to read help).' . N, 1);
        }
    }
}
use function mysqli_real_escape_string as myesc ;
2022-12-12 08:12:29 +01:00
/**
 * Runs a query and terminates the script (through mexit, exit status 3) if it fails.
 *
 * @param mysqli $link Open database connection
 * @param string $query SQL to execute
 * @param int $line Source line of the caller (pass __LINE__), reported in the error message
 * @return mixed Whatever mysqli_query() returned on success
 */
function myq(&$link, $query, $line) {
    try {
        $res = mysqli_query($link, $query);
    }
    catch (Exception $error) {
        mexit('query «' . $query . '» (line ' . $line . ') failed: ' . $error->getMessage() . N, 3);
    }
    // for older php versions, which seem to not catch mysql exceptions
    if ($res === false) mexit('query «' . $query . '» (line ' . $line . ') failed: ' . mysqli_errno($link) . ': ' . mysqli_error($link) . '.' . N, 3);
    return ($res);
}
2020-10-14 08:37:41 +02:00
// Single-instance guard: refuse to start while the lock file exists, unless
// the user asked to ignore it.
$lockfp = __DIR__ . '/crawler.lock';
if (file_exists($lockfp) && !$opts['ignorelock']) {
    // plain exit() here rather than mexit(): mexit() removes the lock file,
    // which in this case was not created by this run
    eecho(3, 'lock file «' . $lockfp . '» exists (if you are sure crawler.php is not already running you can use option «-I» to force execution).' . N);
    exit(1);
}
touch($lockfp);
2020-10-13 08:21:26 +02:00
2020-10-18 06:53:27 +02:00
// Read the configuration, connect to MySQL and load the column limits
// returned by tables() (used by truncs/truncn/willtrunc/notify).
$inifp = __DIR__ . '/../conf/mustard.ini';
$iniarr = @parse_ini_file($inifp)
    or mexit('could not open config file «' . $inifp . '»' . N, 1);
$link = @mysqli_connect($iniarr['db_host'], $iniarr['db_admin_name'], $iniarr['db_admin_password'], $iniarr['db_name'], $iniarr['db_port'], $iniarr['db_socket'])
    or mexit('could not connect to MySQL server: ' . mysqli_connect_error() . N, 1);
mysqli_set_charset($link, 'utf8mb4')
    or mexit('could not set «utf8mb4» charset for MySQL: ' . mysqli_error($link) . N, 1);

require(__DIR__ . '/../site/mustard/include/tables.php');
$tables = tables($link);
//print_r($tables);
//print_r($tables);
2022-12-01 05:41:54 +01:00
// Session recovery: if both job files of a previous, unfinished run exist
// (and recovery was not disabled) reload the instance list and the progress
// counters from them.
$recover = false;
$instsjfp = __DIR__ . '/instances.job';
$currinstjfp = __DIR__ . '/currinst.job';
if (!$opts['dontrestore'] && file_exists($currinstjfp) && file_exists($instsjfp)) {
    eecho(0, 'looks like previous session was interrupted, trying to recover it...' . N);
    $buf = @file($instsjfp, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
        or mexit('could not open file «' . $instsjfp . '» for reading.' . N, 1);
    // one hostname per line; drop stray padding and lines that are blank once trimmed
    $insts = array_values(array_filter(array_map('trim', $buf), 'strlen'));
    $buf = @file($currinstjfp, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
        or mexit('could not open file «' . $currinstjfp . '» for reading.' . N, 1);
    // expected record: host <TAB> index <TAB> ok count <TAB> good count
    $fields = array_map('trim', explode("\t", $buf[0]));
    $valid = (count($fields) >= 4);
    for ($f = 1; $valid && $f <= 3; $f++)
        $valid = (preg_match('/^[0-9]+$/', $fields[$f]) === 1);
    if ($valid) {
        $currinst = array('dom' => $fields[0], 'i' => (int)$fields[1], 'qok' => (int)$fields[2], 'qgood' => (int)$fields[3]);
        $recover = true;
        eecho(1, 'recovered previous session.' . N);
    } else {
        // a truncated or corrupted state file cannot be trusted: start over
        eecho(2, 'file «' . $currinstjfp . '» is malformed, ignoring previous session and starting a new one.' . N);
    }
}
/**
 * Shortens a string so that it fits a given table column.
 *
 * The maximum length is read from the global $tables[$tab][$col]. A longer
 * string is cut to that length, its last character being replaced by «…»,
 * and a notification is issued. Null is returned untouched.
 *
 * @param string|null $str String to check
 * @param string $tab Table name (lowercased when running on Windows)
 * @param string $col Column name
 * @param string $ctx Context prefix for the notification text
 * @return string|null The (possibly shortened) string
 */
function truncs($str, $tab, $col, $ctx) {
    global $tables, $iswin;
    if (is_null($str)) return (null);
    if ($iswin)
        $tab = strtolower($tab);
    $size = $tables[$tab][$col];
    $len = mb_strlen($str, 'UTF-8');
    if ($len > $size) {
        // keep $size - 1 characters plus the ellipsis: $size characters in total
        $str = mb_substr($str, 0, $size - 1, 'UTF-8') . '…';
        notify($ctx . ': had to truncate string to ' . $size . ' chars to be able to insert it into «' . $col . '» column in «' . $tab . '» table.', 3);
    }
    return ($str);
}
/**
 * Clamps a number to the range a given table column can store.
 *
 * The limits are read from the global $tables[$tab][$col]['min'] and ['max'].
 * A notification is issued whenever the value had to be changed; a
 * non-numeric value is replaced by 0.
 *
 * @param mixed $num Value to check
 * @param string $tab Table name (lowercased when running on Windows)
 * @param string $col Column name
 * @param string $ctx Context prefix for the notification text
 * @return mixed The value itself when it fits, otherwise the nearest limit, or 0
 */
function truncn($num, $tab, $col, $ctx) {
    global $tables, $iswin;
    if ($iswin)
        $tab = strtolower($tab);
    if (is_numeric($num)) {
        if ($num > $tables[$tab][$col]['max']) {
            notify($ctx . ': had to ceil «' . $num . '» to «' . $tables[$tab][$col]['max'] . '», ie the maximum value it can have in column «' . $col . '» of table «' . $tab . '».', 3);
            $num = $tables[$tab][$col]['max'];
        } elseif ($num < $tables[$tab][$col]['min']) {
            notify($ctx . ': had to floor «' . $num . '» to «' . $tables[$tab][$col]['min'] . '», ie the minimum value it can have in column «' . $col . '» of table «' . $tab . '».', 3);
            $num = $tables[$tab][$col]['min'];
        }
    } else {
        notify($ctx . ': function «truncn»: expecting a number, got something else; returning «0».', 3);
        $num = 0;
    }
    return ($num);
}
/* $contextopts = array (
'http' => array (
'timeout' => $opts [ 'timeout' ]
),
'socket' => array (
'tcp_nodelay' => true
)
);
$context = stream_context_create ( $contextopts ); */
/**
 * Converts a date string such as «2018-04-07T15:05:26.801Z» to a Unix timestamp.
 *
 * The date is interpreted as UTC. When it carries a fractional part the
 * result is a float including it, otherwise an integer. If the string is not
 * in the expected format a notification is issued and the current time is
 * returned.
 *
 * @param string $pgdate Date as «Y-m-d», a space or «T», «H:i:s», optional fraction, optional «Z»
 * @return int|float Unix timestamp
 */
function pgdatetomy($pgdate) {
    //2018-04-07T15:05:26.801Z
    if (preg_match('/^(\d+)-(\d+)-(\d+)[ T]{1}(\d+):(\d+):(\d+)(\.\d+)?Z?$/', $pgdate, $buf) === 1) {
        $mtime = gmmktime((int)$buf[4], (int)$buf[5], (int)$buf[6], (int)$buf[2], (int)$buf[3], (int)$buf[1]);
        // group 7 (".801") is only present when the date has a fractional part
        if (array_key_exists(7, $buf))
            $mtime = $mtime + floatval('0' . $buf[7]);
        return ($mtime);
    } else {
        notify('Function «pgdatetomy»: «' . $pgdate . '» has not a recognized date format; returning current date.', 3);
        return (time());
    }
}
2022-12-01 05:41:54 +01:00
// Fresh session: build the list of instances to check from the database
// (alive instances, optionally restricted by «moreclauses») plus the optional
// peers file, then save it to the instances job file for a possible recovery.
if (!$recover) {

    $insts = array();
    // hostnames already queued, used as a set for constant-time duplicate checks
    $queued = array();

    // the space keeps the user's clauses apart from «Dead=0»
    $res = myq($link, 'SELECT URI FROM Instances WHERE Dead=0 ' . $opts['moreclauses'], __LINE__);
    while ($row = mysqli_fetch_assoc($res)) {
        if (!isset($queued[$row['URI']])) {
            $queued[$row['URI']] = true;
            $insts[] = $row['URI'];
        }
    }
    eecho(1, 'loaded known, alive instances from the database into the list of instances to be checked.' . N);

    $res = myq($link, 'SELECT URI FROM Instances WHERE Dead=1', __LINE__);
    $deadinsts = array();
    while ($row = mysqli_fetch_assoc($res))
        $deadinsts[$row['URI']] = true;
    eecho(1, 'loaded dead instances into the corresponding list.' . N);

    if (!is_null($opts['peersfp'])) {
        eecho(0, 'loading other instances to be checked from «' . $opts['peersfp'] . '».' . N);
        $peers = @file($opts['peersfp'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($peers === false)
            mexit('could not open «' . $opts['peersfp'] . '» for reading.' . N, 1);
        foreach ($peers as $pdom) {
            if (isset($queued[$pdom]))
                continue;
            if (isset($deadinsts[$pdom])) {
                eecho(1, 'ignoring instance «' . $pdom . '» because it is dead.' . N);
                continue;
            }
            if (willtrunc($pdom, 'Instances', 'URI')) {
                eecho(2, 'ignoring instance «' . $pdom . '» because its hostname is too long for column «URI» of table «Instances».' . N);
                continue;
            }
            $queued[$pdom] = true;
            $insts[] = $pdom;
        }
    }

    unset($deadinsts, $queued);
    sort($insts);
    // shuffle($insts);
    eecho(1, count($insts) . ' instances to be checked.' . N);
    $instsf = @fopen($instsjfp, 'w')
        or mexit('could not open «' . $instsjfp . '» for writing.' . N, 1);
    foreach ($insts as $host)
        fwrite($instsf, $host . N);
    fclose($instsf);
}
/**
 * Tells whether a string is too long for a given table column.
 *
 * The maximum length is read from the global $tables[$tab][$col]; the length
 * of the string is counted in UTF-8 characters.
 *
 * @param string $str String to check
 * @param string $tab Table name (lowercased when running on Windows)
 * @param string $col Column name
 * @return bool True when the string exceeds the column length
 */
function willtrunc($str, $tab, $col) {
    global $tables, $iswin;
    $table = $iswin ? strtolower($tab) : $tab;
    return (mb_strlen($str, 'UTF-8') > $tables[$table][$col]);
}
/**
 * Converts a boolean to 1 or 0.
 *
 * A value which is not a boolean produces a notification and is converted to 0.
 *
 * @param mixed $bool Value to convert
 * @param string $pre Prefix for the notification text
 * @return int 1 for true, 0 for false or for a non-boolean value
 */
function b2i($bool, $pre) {
    if (is_bool($bool)) {
        if ($bool)
            return (1);
        else
            return (0);
    } else {
        notify($pre . '«' . $bool . '» is not a boolean value, returning «0».', 3);
        return (0);
    }
}
/**
 * "Array Key Exists And Value Is Not Null": true only when $arr is an array,
 * it has the key $key and the value stored there is not null.
 *
 * @param int|string $key Key to look for
 * @param mixed $arr Variable to inspect (taken by reference, never modified)
 * @return bool
 */
function akeavinn($key, &$arr) {
    if (!is_array($arr))
        return (false);
    if (!array_key_exists($key, $arr))
        return (false);
    return ($arr[$key] !== null);
}
/**
 * Turns an empty or whitespace-only string into null.
 *
 * @param string $str String to check
 * @return string|null Null when the string matches /^\s*$/, the string itself otherwise
 */
function nempty($str) {
    $blank = (preg_match('/^\s*$/', $str) === 1);
    return ($blank ? null : $str);
}
/**
 * "Sub-array implode": joins the values found under the same key in every
 * element of an array of arrays.
 *
 * @param string $glue Separator put between the values
 * @param int|string $key Key to read from every sub-array
 * @param array $arr Array of arrays (taken by reference, never modified)
 * @return string The joined values, or an empty string for an empty array
 */
function subarimp($glue, $key, &$arr) {
    $pieces = array();
    foreach ($arr as $row)
        $pieces[] = $row[$key];
    return (implode($glue, $pieces));
}
/**
 * Prints a notification on the console and, unless in dry-run mode, stores it
 * in the «Notifications» table.
 *
 * The console copy has its tags stripped; the stored copy is cut to the
 * length of the «Notification» column.
 *
 * @param string $msg Notification text
 * @param int $sev "Severity" should be called "Importance"; it is to be thought of
 *                 as the "$lev" param of function "eecho": 0=debug, 1=info, 2=warning, 3=error
 */
function notify($msg, $sev) {
    global $link, $tables, $iswin, $opts;
    eecho($sev, '*notification*: ' . strip_tags($msg) . N);
    $tab = 'Notifications';
    if ($iswin) $tab = 'notifications';
    // $sev goes into the SQL text unquoted, so force it to be an integer
    if (!$opts['dryrun']) myq($link, 'INSERT INTO Notifications (ID, Notification, Severity, Microtime, Seen, Deleted) VALUES (NULL, \'' . myesc($link, mb_substr($msg, 0, $tables[$tab]['Notification'], 'UTF-8')) . '\', ' . (int)$sev . ', \'' . microtime(true) . '\', 0, 0)', __LINE__);
}
/** <LANGUAGE MANAGEMENT> */
/**
 * Executes a call to Mastodon API.
 *
 * The request is made over https using the configured timeout; the response
 * headers are handed to ckratelimit() before the body is decoded.
 *
 * @param string $host Host to be called (e.g.: "mastodon.bida.im")
 * @param string $path API path (e.g.: "/api/v1/timelines/public?local=true")
 * @return mixed An array representing the JSON object as returned by json_decode, or NULL if the call fails
 */
function get_api($host, $path) {
    global $opts;
    $buf = @gurl('https://' . $host . $path, $opts['timeout']);
    if ($buf['cont'] === false)
        return NULL;
    ckratelimit($buf['headers']);
    return json_decode($buf['cont'], true);
}
/**
 * Returns a list of known recognized languages, with the related probability, for the toot that got passed to it
 *
 * When the toot declares its language, that language is returned with
 * probability 1; otherwise the language is detected from the toot's content.
 * Anything that is not an array, or has neither a language nor a content,
 * yields an empty list.
 *
 * @param mixed $toot The toot to be checked, as returned by the API
 * @return array Associative array with language and related probability
 */
function get_toot_languages($toot) {
    $langs = array();
    if (is_array($toot)) {
        if (array_key_exists('language', $toot) && $toot['language'] !== NULL) {
            // the language is explicitly set in the toot, so use that
            $langs[$toot['language']] = 1;
        } elseif (array_key_exists('content', $toot)) {
            // the language is not explicitly set in the toot, so try and recognize it
            $text = strip_tags($toot['content']);
            $ld = new Language;
            $langs = $ld->detect($text)->bestResults()->close();
        }
    }
    // group derived languages into two-characters language code (e.g.: "zh-CN" into "zh")
    $grouped_langs = array();
    foreach ($langs as $key => $value) {
        $l = explode('-', $key)[0];
        if (array_key_exists($l, $grouped_langs)) {
            $grouped_langs[$l] = max($grouped_langs[$l], $value);
        } else {
            $grouped_langs[$l] = $value;
        }
    }
    return $grouped_langs;
}
/**
 * Given the probability of a language for every toot, calculate the average
 *
 * Weights are summed per language and divided by the total number of
 * toots, so a language missing from a toot counts as 0 for that toot.
 *
 * @param array $detected_langs Array of mappings between language and probability
 * @return array Mapping between language and probability
 */
function summary($detected_langs) {
    $totals = array();
    foreach ($detected_langs as $toot_langs) {
        foreach ($toot_langs as $code => $weight) {
            $so_far = array_key_exists($code, $totals) ? $totals[$code] : 0;
            $totals[$code] = $so_far + $weight;
        }
    }
    $toots = count($detected_langs);
    $averages = array();
    foreach ($totals as $code => $total)
        $averages[$code] = $total / $toots;
    return $averages;
}
/**
 * Helper function for usort: compares two arrays using the first element,
 * so that the array is sorted by descending first element
 *
 * @param array $entry1 First array to be compared
 * @param array $entry2 Second array to be compared
 * @return number 1, 0 or -1 depending on $entry1[0] being less than, equal to or greater than $entry2[0]
 */
function sort_weights($entry1, $entry2) {
    $first = $entry1[0];
    $second = $entry2[0];
    if ($first < $second)
        return 1;
    return ($first == $second) ? 0 : -1;
}
/**
 * Given a language mapping, return a list of probable languages
 *
 * Languages are taken by descending probability; the list stops at the first
 * language whose probability is less than two thirds of the previous one.
 *
 * @param array $summary Map between language and probability
 * @return string[] List of probable languages
 */
function get_languages($summary) {
    // pair every weight with its code: [[weight, code], ...]
    $ranked = array_map(null, array_values($summary), array_keys($summary));
    usort($ranked, 'sort_weights');
    $chosen = [];
    $previous = 0;
    foreach ($ranked as $pair) {
        list($weight, $code) = $pair;
        if ($weight < $previous * 2 / 3)
            break;
        $chosen[] = $code;
        $previous = $weight;
    }
    return $chosen;
}
/**
 * Returns a list of probable languages for the given instance
 *
 * The languages are worked out from the latest local public toots (as many
 * as the «ldtoots» setting). An empty list is returned when the timeline
 * cannot be fetched or does not contain any toot.
 *
 * @param string $host Instance’s hostname (e.g.: "mastodon.bida.im")
 * @return string[] List of probable languages
 */
function get_instance_langs($host) {
    global $opts;
    $data = get_api($host, '/api/v1/timelines/public?local=true&limit=' . $opts['ldtoots']);
    // a decoded payload that is not an array cannot be a timeline
    if ($data == NULL || !is_array($data)) {
        return [];
    }
    // every toot is decoded to an array: this drops the members of an error
    // object such as {"error": "..."}, which must not be counted as toots
    $toots = array_values(array_filter($data, 'is_array'));
    if (count($toots) == 0) {
        return [];
    }
    $detected_langs = array_map('get_toot_languages', $toots);
    $summary = summary($detected_langs);
    $languages = get_languages($summary);
    return $languages;
}
2020-10-21 15:26:31 +02:00
require ( __DIR__ . '/../site/mustard/include/mb_ucfirst.php' );
2020-10-13 08:21:26 +02:00
/**
 * Works out the languages of an instance and makes sure each of them has a
 * record in the «Languages» table.
 *
 * The languages are either detected from the instance's latest toots ($auto)
 * or taken from the global $info['languages']; at most the first 5 are kept
 * and «-» is replaced by «_» in their codes. A language missing from the
 * table is inserted together with its display names (in dry-run mode nothing
 * is inserted and its id is 0).
 *
 * @param mixed $instid Id of the instance, copied into every returned element
 * @param string $uri Hostname of the instance
 * @param bool $auto Whether to detect the languages from the latest toots
 * @return array List of arrays with keys «InstID», «LangID», «Pos» (1-based) and «Code»
 */
function langs($instid, $uri, $auto) {
    global $info, $instrow, $link, $opts;
    $retlangs = array();
    $languages = array();
    // even if $auto is true, set it to false (don't do autodection of languages based on last toots) if api/v1/instance returned a language different from the default "en": assume instead it is right, because it has been explicitly set
    if (isset($info['languages'][0]) && $info['languages'][0] != 'en')
        $auto = false;
    if ($auto) {
        $languages = get_instance_langs($uri);
    } elseif (akeavinn('languages', $info)) {
        $languages = $info['languages'];
    }
    // a malformed instance record could carry something else than a list here
    if (!is_array($languages) || count($languages) == 0)
        return ($retlangs);

    $languages = array_slice($languages, 0, 5);
    foreach ($languages as $key => $val)
        $languages[$key] = str_replace('-', '_', $val);
    if ($auto)
        eecho(1, 'detected languages: ' . implode(', ', $languages) . N);
    else
        eecho(1, 'declared languages: ' . implode(', ', $languages) . N);

    $ctx = '«' . $instrow['URI'] . '»';
    $pos = 0;
    foreach ($languages as $lang) {
        $res = myq($link, 'SELECT * FROM Languages WHERE Code=\'' . myesc($link, $lang) . '\'', __LINE__);
        if (mysqli_num_rows($res) < 1) {
            // name column => locale the language's name is written in
            $namecols = array(
                'NameOrig' => $lang,
                'NamePT_BR' => 'pt_BR',
                'NameDE' => 'de',
                'NameUK' => 'uk',
                'NameCA' => 'ca',
                'NameEN' => 'en',
                'NameES' => 'es',
                'NameFR' => 'fr',
                'NameGL' => 'gl',
                'NameIT' => 'it'
            );
            $cols = array('ID', 'Code');
            $vals = array('NULL', '\'' . myesc($link, truncs($lang, 'Languages', 'Code', $ctx)) . '\'');
            foreach ($namecols as $col => $locale) {
                $cols[] = $col;
                $vals[] = '\'' . myesc($link, truncs(mb_ucfirst(locale_get_display_name($lang, $locale)), 'Languages', $col, $ctx)) . '\'';
            }
            $q = 'INSERT INTO Languages (' . implode(', ', $cols) . ') VALUES (' . implode(', ', $vals) . ')';
            if (!$opts['dryrun']) {
                myq($link, $q, __LINE__);
                $langid = mysqli_insert_id($link);
            } else {
                $langid = 0;
            }
        } else {
            $row = mysqli_fetch_assoc($res);
            $langid = $row['ID'];
        }
        $pos++;
        $retlangs[] = array('InstID' => $instid, 'LangID' => $langid, 'Pos' => $pos, 'Code' => $lang);
    }
    return ($retlangs);
}
/**
 * Returns what var_dump() would print for a variable, instead of printing it.
 *
 * @param mixed $var Variable to dump
 * @return string Output of var_dump()
 */
function varbdump($var) {
    ob_start();
    var_dump($var);
    // fetch the captured output and drop the buffer in one step
    return (ob_get_clean());
}
/**
 * Sorts an array of arrays, in place, by the numeric value each sub-array
 * holds under a given key. The keys of the outer array are preserved.
 *
 * Sub-arrays holding the same value are all kept (their relative order is
 * the one produced by uasort).
 *
 * @param array $arr Array of arrays to sort
 * @param int|string $key Key of the value to sort by
 * @param bool $rev Sort in descending order
 */
function mdasortbykey(&$arr, $key, $rev = false) {
    // compare the values themselves: using them as keys of a temporary array
    // would make sub-arrays with equal values overwrite each other
    uasort($arr, function ($a, $b) use ($key, $rev) {
        if ($a[$key] == $b[$key])
            return 0;
        $cmp = ($a[$key] < $b[$key]) ? -1 : 1;
        return $rev ? -$cmp : $cmp;
    });
}
2020-10-21 15:26:31 +02:00
require ( __DIR__ . '/../site/mustard/include/ghs.php' );
2020-10-13 08:21:26 +02:00
2020-10-21 15:26:31 +02:00
require ( __DIR__ . '/../site/mustard/include/ght.php' );
2020-10-13 08:21:26 +02:00
/*
 * Nodeinfo ('https://'.$host.'/nodeinfo/2.0.json') was added in v3.0.0
 * Trends ('https://'.$host.'/api/v1/trends') was added in v3.0.0
 * Activity ('https://'.$host.'/api/v1/instance/activity') was added in v2.1.2
 */
// Optional JSON output: append to the existing file when recovering a
// previous session, otherwise start a new file with its opening brace.
if ($opts['jsonwrite']) {
    $mode = $recover ? array('a', 'append') : array('w', 'write');
    $jsonf = @fopen($opts['jsonfp'], $mode[0])
        or mexit('could not open file «' . $opts['jsonfp'] . '» in ' . $mode[1] . ' mode.' . N, 1);
    if ($mode[0] == 'w')
        fwrite($jsonf, '{' . N);
}
2022-12-11 23:29:51 +01:00
2020-10-13 08:21:26 +02:00
// Counters of the main loop; when recovering a session they restart from the
// values saved in the current-instance job file.
$tini = time();
$cinsts = count($insts);
$i = 0;
$qok = 0;
$qgood = 0;
if ($recover) {
    // values read back from the job file may be strings: make them integers
    $i = (int)$currinst['i'];
    $qok = (int)$currinst['qok'];
    $qgood = (int)$currinst['qgood'];
}
// index this run started from
$beg = $i;
while ( $i < $cinsts ) {
2022-12-01 05:41:54 +01:00
$now = time ();
$host = $insts [ $i ];
@ file_put_contents ( $currinstjfp , $host . " \t " . $i . " \t " . $qok . " \t " . $qgood . N )
or mexit ( 'could not open «' . $currinstjfp . '» for writing.' , 1 );
2020-10-13 08:21:26 +02:00
$i ++ ;
$ismast = null ;
$instans = true ;
$info = null ;
2022-12-01 05:41:54 +01:00
$tela = $now - $tini ;
eecho ( 1 , 'working on «' . $host . '»; ' . $i . '/' . $cinsts . '; ' . $qok . ' ok; ' . $qgood . ' good; ' . round ( 100 / $cinsts * $i ) . '%; elapsed time: ' . ght ( $tela , null , 0 ) . '; estimated remaining time: ' . ght ( $tela / $i * ( $cinsts - $beg ) - $tela , null , 0 ) . '; mem.: ' . ghs ( memory_get_usage ( true )) . '; mem. peak: ' . ghs ( memory_get_peak_usage ( true )) . N );
if ( willtrunc ( $host , 'Instances' , 'URI' )) {
2022-12-09 22:53:18 +01:00
eecho ( 2 , '«' . $host . '»: ignoring it because hostname is too long for the «URI» column of «Instances» table.' . N );
2020-10-13 08:21:26 +02:00
} else {
2022-12-09 22:53:18 +01:00
eecho ( 0 , '«' . $host . '»: trying to fetch instance info from API...' . N );
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( 'https://' . $host . '/api/v1/instance' , $opts [ 'timeout' ]);
2020-10-13 08:21:26 +02:00
if ( $buf [ 'cont' ] !== false ) {
2022-12-09 22:53:18 +01:00
ckratelimit ( $buf [ 'headers' ]);
2022-12-01 05:41:54 +01:00
$info =@ json_decode ( $buf [ 'cont' ], true );
2020-10-13 08:21:26 +02:00
if ( is_array ( $info )) {
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $host . '»: got instance info from API :-)' . N );
2022-12-11 23:29:51 +01:00
eecho ( 0 , '«' . $host . '»: trying to fetch nodeinfo specs on https...' . N );
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( 'https://' . $host . '/.well-known/nodeinfo' , $opts [ 'timeout' ]);
2022-12-11 23:29:51 +01:00
if ( $buf [ 'cont' ] === false ) {
eecho ( 0 , '«' . $host . '»: trying to fetch nodeinfo specs on http...' . N );
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( 'http://' . $host . '/.well-known/nodeinfo' , $opts [ 'timeout' ]);
2022-12-11 23:29:51 +01:00
}
2020-10-13 08:21:26 +02:00
if ( $buf [ 'cont' ] !== false ) {
2022-12-11 23:29:51 +01:00
$buf =@ json_decode ( $buf [ 'cont' ], true );
if ( is_array ( $buf ) && array_key_exists ( 'links' , $buf ) && is_array ( $buf [ 'links' ]) && count ( $buf [ 'links' ]) > 0 ) {
$nirefs = [];
foreach ( $buf [ 'links' ] as $key => $niref )
if ( isset ( $niref [ 'rel' ]) && isset ( $niref [ 'href' ]))
$nirefs [ $niref [ 'rel' ]] = $niref [ 'href' ];
else
eecho ( 2 , '«' . $host . '»: nodeinfo specs link ' . $key . ' has unexpected format.' . N );
krsort ( $nirefs );
$niref = array_shift ( $nirefs );
eecho ( 0 , '«' . $host . '»: got nodeinfo specs; trying to fetch nodeinfo...' . N );
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( $niref , $opts [ 'timeout' ]);
2022-12-11 23:29:51 +01:00
if ( $buf [ 'cont' ] !== false ) {
$buf =@ json_decode ( $buf [ 'cont' ], true );
if ( is_array ( $buf ) && isset ( $buf [ 'software' ][ 'name' ]) && isset ( $buf [ 'software' ][ 'version' ])) {
2022-12-12 00:47:06 +01:00
$info [ 'x-nodeinfo' ] = $buf ;
if ( preg_match ( '/^mastodon|fedibird|ecko|hometown/' , $info [ 'x-nodeinfo' ][ 'software' ][ 'name' ]) === 1 )
2022-12-11 23:29:51 +01:00
$ismast = true ;
2022-12-12 08:12:29 +01:00
$res = myq ( $link , 'SELECT Name FROM Platforms WHERE Name=\'' . myesc ( $link , $info [ 'x-nodeinfo' ][ 'software' ][ 'name' ]) . '\'' , __LINE__ );
2022-12-11 23:29:51 +01:00
if ( mysqli_num_rows ( $res ) < 1 ) {
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'INSERT INTO Platforms (Name) VALUES (\'' . myesc ( $link , truncs ( $info [ 'x-nodeinfo' ][ 'software' ][ 'name' ], 'Platforms' , 'Name' , '«' . $host . '»' )) . '\')' , __LINE__ )
2022-12-11 23:29:51 +01:00
or mexit ( __LINE__ . ': ' . mysqli_error ( $link ) . N , 3 );
2022-12-12 00:47:06 +01:00
notify ( 'New software found: «' . $host . '» runs on «' . $info [ 'x-nodeinfo' ][ 'software' ][ 'name' ] . '»; i added it to the table of known softwares. It would be good to check whether it is a Mastodon derivate and how compatible it is, to decide whether to consider instances using it as Mastodon instances.' , 2 );
2022-12-11 23:29:51 +01:00
}
} else {
eecho ( 2 , '«' . $host . '»: nodeinfo was not good json or json had unexpected format.' . N );
}
2020-10-13 08:21:26 +02:00
}
2022-12-11 23:29:51 +01:00
} else {
eecho ( 2 , '«' . $host . '»: nodeinfo specs where not good json or json had unexpected format.' . N );
2020-10-13 08:21:26 +02:00
}
2022-12-15 12:45:20 +01:00
} else {
eecho ( 2 , '«' . $host . '»: could not retrieve nodeinfo specs.' . N );
2020-10-13 08:21:26 +02:00
}
if ( array_key_exists ( 'version' , $info )) {
2022-12-11 23:29:51 +01:00
eecho ( 1 , '«' . $host . '» software version is «' . $info [ 'version' ] . '».' . N );
2020-10-13 08:21:26 +02:00
if ( $info [ 'version' ] >= '2.1.2' ) {
2022-12-09 22:53:18 +01:00
eecho ( 0 , '«' . $host . '»: trying to fetch instance activity info from API...' . N );
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( 'https://' . $host . '/api/v1/instance/activity' , $opts [ 'timeout' ]);
2020-10-13 08:21:26 +02:00
if ( $buf [ 'cont' ] !== false ) {
2022-12-09 22:53:18 +01:00
ckratelimit ( $buf [ 'headers' ]);
eecho ( 1 , '«' . $host . '»: got instance activity info from API :-)' . N );
2020-10-13 08:21:26 +02:00
$info [ 'x-activity' ] = json_decode ( $buf [ 'cont' ], true );
} else {
2022-12-09 22:53:18 +01:00
eecho ( 2 , '«' . $host . '»: could not fetch instance activity from API: ' . $buf [ 'emsg' ] . N );
2020-10-13 08:21:26 +02:00
}
}
if ( $info [ 'version' ] >= '3.0.0' ) {
2022-12-09 22:53:18 +01:00
eecho ( 0 , '«' . $host . '»: trying to fetch instance trends info from API...' . N );
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( 'https://' . $host . '/api/v1/trends' , $opts [ 'timeout' ]);
2020-10-13 08:21:26 +02:00
if ( $buf [ 'cont' ] !== false ) {
2022-12-09 22:53:18 +01:00
ckratelimit ( $buf [ 'headers' ]);
eecho ( 1 , '«' . $host . '»: got instance trends info from API :-)' . N );
2020-10-13 08:21:26 +02:00
$info [ 'x-trends' ] = json_decode ( $buf [ 'cont' ], true );
} else {
2022-12-09 22:53:18 +01:00
eecho ( 2 , '«' . $host . '»: could not fetch instance trends from API: ' . $buf [ 'emsg' ] . N );
2020-10-13 08:21:26 +02:00
}
}
}
} else {
$instans = false ;
2022-12-09 22:53:18 +01:00
eecho ( 2 , '«' . $host . '»: fetched data were not good JSON.' . N );
2020-10-13 08:21:26 +02:00
}
} else {
$instans = false ;
2022-12-09 22:53:18 +01:00
eecho ( 2 , '«' . $host . '»: could not fetch instance info from API: ' . $buf [ 'emsg' ] . N );
2020-10-13 08:21:26 +02:00
}
if ( ! isset ( $info [ 'uri' ]) || preg_match ( '#^\s*$#' , $info [ 'uri' ]) === 1 )
$instans = false ;
if ( is_array ( $info ) && count ( $info ) > 0 ) {
2022-12-01 05:41:54 +01:00
//echo('json dump of all fetched info:'.N.json_encode($info,JSON_PRETTY_PRINT).N);
2020-10-13 08:21:26 +02:00
if ( $opts [ 'jsonwrite' ])
2022-12-01 05:41:54 +01:00
fwrite ( $jsonf , '"' . $host . '": ' . json_encode ( $info , JSON_PRETTY_PRINT ) . ',' . N );
2020-10-13 08:21:26 +02:00
}
if ( ! $instans ) {
2022-12-01 05:41:54 +01:00
// this is the limbo of non-responding instances
2022-12-12 08:12:29 +01:00
$res = myq ( $link , 'SELECT * FROM Instances WHERE URI=\'' . myesc ( $link , $host ) . '\'' , __LINE__ );
2022-12-01 05:41:54 +01:00
$nrows = mysqli_num_rows ( $res );
if ( $nrows == 1 ) {
2022-12-11 23:29:51 +01:00
eecho ( 1 , '«' . $host . '»: didn’ t respond, but it is present in the database; updating InstChecks, Instances.LastCheckOk and possibly Instances.New=0 and Instances.Dead=1.' . N );
2020-10-13 08:21:26 +02:00
$row = mysqli_fetch_assoc ( $res );
2022-12-11 23:29:51 +01:00
$instid = $row [ 'ID' ];
2022-12-12 08:36:18 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'UPDATE Instances SET LastCheckOk=0 WHERE ID=' . $instid , __LINE__ );
2022-12-01 05:41:54 +01:00
if ( $row [ 'New' ] == 1 && ! is_null ( $row [ 'FirstSeen' ]) && $now - $row [ 'FirstSeen' ] > $opts [ 'oldline' ]) {
2022-12-11 23:29:51 +01:00
notify ( 'Instance «<a href="viewinst.php?id=' . $instid . '">' . $row [ 'URI' ] . '</a>» is no longer new.' , 2 );
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'UPDATE Instances SET New=0 WHERE ID=' . $instid , __LINE__ );
2022-12-01 05:41:54 +01:00
}
2020-10-13 08:21:26 +02:00
2022-12-01 05:41:54 +01:00
// we check the last time instance responded, if ever
2022-12-12 08:12:29 +01:00
$rres = myq ( $link , 'SELECT Time FROM InstChecks WHERE InstID=' . $instid . ' AND Status=1 ORDER BY Time DESC LIMIT 1' , __LINE__ );
2022-12-01 05:41:54 +01:00
// if instance never responded we consider the time of first check
2022-12-12 08:12:29 +01:00
if ( mysqli_num_rows ( $rres ) == 0 )
$rres = myq ( $link , 'SELECT Time FROM InstChecks WHERE InstID=' . $instid . ' AND Status=0 ORDER BY Time ASC LIMIT 1' , __LINE__ );
2020-10-13 08:21:26 +02:00
if ( mysqli_num_rows ( $rres ) > 0 ) {
$rrow = mysqli_fetch_assoc ( $rres );
if ( $now - $rrow [ 'Time' ] > $opts [ 'deadline' ]) {
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'UPDATE Instances SET Dead=1 WHERE ID=' . $instid , __LINE__ );
2022-12-11 23:29:51 +01:00
notify ( 'Instance «<a href="viewinst.php?id=' . $instid . '">' . $row [ 'URI' ] . '</a>» is dead!' , 2 );
2020-10-13 08:21:26 +02:00
}
} else {
2022-12-11 23:29:51 +01:00
eecho ( 2 , '«' . $host . '»: exists in the database but there’ s no data about it in InstChecks!' . N );
2020-10-13 08:21:26 +02:00
}
2022-12-01 05:41:54 +01:00
} elseif ( $nrows == 0 ) {
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $host . '»: doesn’ t respond and is not in the database, adding it.' . N );
2022-12-15 12:45:20 +01:00
// "FirstSeen=NULL" because it's not seen until it responds for the first time
2020-10-13 08:21:26 +02:00
if ( ! $opts [ 'dryrun' ]) {
2022-12-15 12:45:20 +01:00
myq ( $link , 'INSERT INTO Instances SET FirstSeen=NULL, New=1, Good=0, Chosen=0, Visible=0, Noxious=0, URI=\'' . myesc ( $link , $host ) . '\', LastCheckOk=0, InsertTS=' . $now , __LINE__ );
2020-10-13 08:21:26 +02:00
$instid = mysqli_insert_id ( $link );
2022-12-12 08:12:29 +01:00
myq ( $link , 'INSERT INTO InstChecks SET InstID=' . $instid . ', Time=' . $now . ', Status=0' , __LINE__ );
2020-10-13 08:21:26 +02:00
} else {
$instid = 0 ;
}
2022-12-01 05:41:54 +01:00
} else {
2022-12-11 23:29:51 +01:00
notify ( 'Instance «' . $host . '» has ' . $nrows . ' entries in «Instances» table!' , 3 );
2020-10-13 08:21:26 +02:00
}
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'INSERT INTO InstChecks (InstID, Time, Status) VALUES (' . $instid . ', ' . $now . ', 0)' , __LINE__ );
2020-10-13 08:21:26 +02:00
} else {
2022-12-01 05:41:54 +01:00
// instance responded
2020-10-13 08:21:26 +02:00
if ( is_null ( $ismast )) {
2022-12-12 22:40:17 +01:00
if ( array_key_exists ( 'pleroma' , $info )) {
2020-10-13 08:21:26 +02:00
$ismast = false ;
2022-12-12 22:40:17 +01:00
} elseif ( preg_match ( '#(pleroma|pixelfed)#i' , $info [ 'version' ]) == 1 ) {
2020-10-13 08:21:26 +02:00
$ismast = false ;
} elseif ( preg_match ( '#^[0-9]+\.[0-9]+\.[0-9]+#' , $info [ 'version' ]) !== 1 ) {
$ismast = false ;
}
}
$qok ++ ;
if ( ! is_null ( $ismast ))
( $ismast ) ? $ismast = 1 : $ismast = 0 ;
2022-11-30 07:19:14 +01:00
$instrow = array ( 'ID' => null , 'FirstSeen' => null , 'IsMastodon' => $ismast , 'Dead' => 0 , 'New' => 0 , 'Good' => 0 , 'Chosen' => 0 , 'Priority' => null , 'Visible' => 0 , 'Noxious' => 0 , 'NoxReason' => null , 'NoxLastModTS' => null , 'URI' => null , 'Title' => null , 'ShortDesc' => null , 'LongDesc' => null , 'OurDesc' => null , 'OurDescEN' => null , 'LocalityID' => null , 'OurLangsLock' => 0 , 'Email' => null , 'Software' => null , 'Version' => null , 'UserCount' => null , 'StatusCount' => null , 'DomainCount' => null , 'ActiveUsersMonth' => null , 'ActiveUsersHalfYear' => null , 'Thumb' => null , 'RegOpen' => null , 'RegReqApproval' => null , 'MaxTootChars' => null , 'AdmAccount' => null , 'AdmDisplayName' => null , 'AdmCreatedAt' => null , 'AdmNote' => null , 'AdmURL' => null , 'AdmAvatar' => null , 'AdmHeader' => null , 'LastCheckOk' => 1 , 'GuestID' => null , 'LastGuestEdit' => null );
2022-12-11 23:29:51 +01:00
$instrow [ 'URI' ] = $host ;
2020-10-13 08:21:26 +02:00
if ( akeavinn ( 'title' , $info ))
$instrow [ 'Title' ] = nempty ( truncs ( $info [ 'title' ], 'Instances' , 'Title' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'short_description' , $info ))
$instrow [ 'ShortDesc' ] = nempty ( truncs ( $info [ 'short_description' ], 'Instances' , 'ShortDesc' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'description' , $info ))
$instrow [ 'LongDesc' ] = nempty ( truncs ( $info [ 'description' ], 'Instances' , 'LongDesc' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'email' , $info ))
$instrow [ 'Email' ] = nempty ( truncs ( $info [ 'email' ], 'Instances' , 'Email' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'version' , $info ))
$instrow [ 'Version' ] = nempty ( truncs ( $info [ 'version' ], 'Instances' , 'Version' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'stats' , $info )) {
if ( akeavinn ( 'user_count' , $info [ 'stats' ]))
$instrow [ 'UserCount' ] = truncn ( $info [ 'stats' ][ 'user_count' ], 'Instances' , 'UserCount' , '«' . $instrow [ 'URI' ] . '»' );
if ( akeavinn ( 'status_count' , $info [ 'stats' ]))
$instrow [ 'StatusCount' ] = truncn ( $info [ 'stats' ][ 'status_count' ], 'Instances' , 'StatusCount' , '«' . $instrow [ 'URI' ] . '»' );
if ( akeavinn ( 'domain_count' , $info [ 'stats' ]))
$instrow [ 'DomainCount' ] = truncn ( $info [ 'stats' ][ 'domain_count' ], 'Instances' , 'DomainCount' , '«' . $instrow [ 'URI' ] . '»' );
}
if ( akeavinn ( 'thumbnail' , $info ))
$instrow [ 'Thumb' ] = nempty ( truncs ( $info [ 'thumbnail' ], 'Instances' , 'Thumb' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'max_toot_chars' , $info ))
$instrow [ 'MaxTootChars' ] = truncn ( $info [ 'max_toot_chars' ], 'Instances' , 'MaxTootChars' , '«' . $instrow [ 'URI' ] . '»' );
if ( akeavinn ( 'registrations' , $info ))
$instrow [ 'RegOpen' ] = b2i ( $info [ 'registrations' ], 'Istanza «' . $instrow [ 'URI' ] . '»: ' );
if ( akeavinn ( 'approval_required' , $info ))
$instrow [ 'RegReqApproval' ] = b2i ( $info [ 'approval_required' ], 'Istanza «' . $instrow [ 'URI' ] . '»: ' );
if ( akeavinn ( 'contact_account' , $info )) {
if ( akeavinn ( 'acct' , $info [ 'contact_account' ]))
$instrow [ 'AdmAccount' ] = nempty ( truncs ( $info [ 'contact_account' ][ 'acct' ], 'Instances' , 'AdmAccount' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'display_name' , $info [ 'contact_account' ]))
$instrow [ 'AdmDisplayName' ] = nempty ( truncs ( $info [ 'contact_account' ][ 'display_name' ], 'Instances' , 'AdmDisplayName' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'created_at' , $info [ 'contact_account' ]))
$instrow [ 'AdmCreatedAt' ] = pgdatetomy ( $info [ 'contact_account' ][ 'created_at' ]);
if ( akeavinn ( 'note' , $info [ 'contact_account' ]))
2020-10-15 19:24:45 +02:00
$instrow [ 'AdmNote' ] = nempty ( truncs ( $info [ 'contact_account' ][ 'note' ], 'Instances' , 'AdmNote' , '«' . $instrow [ 'URI' ] . '»' ));
2020-10-13 08:21:26 +02:00
if ( akeavinn ( 'url' , $info [ 'contact_account' ]))
$instrow [ 'AdmURL' ] = nempty ( truncs ( $info [ 'contact_account' ][ 'url' ], 'Instances' , 'AdmURL' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'avatar' , $info [ 'contact_account' ]))
$instrow [ 'AdmAvatar' ] = nempty ( truncs ( $info [ 'contact_account' ][ 'avatar' ], 'Instances' , 'AdmAvatar' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'header' , $info [ 'contact_account' ]))
$instrow [ 'AdmHeader' ] = nempty ( truncs ( $info [ 'contact_account' ][ 'header' ], 'Instances' , 'AdmHeader' , '«' . $instrow [ 'URI' ] . '»' ));
}
if ( akeavinn ( 'x-nodeinfo' , $info )) {
if ( akeavinn ( 'software' , $info [ 'x-nodeinfo' ]) && akeavinn ( 'name' , $info [ 'x-nodeinfo' ][ 'software' ]))
$instrow [ 'Software' ] = nempty ( truncs ( $info [ 'x-nodeinfo' ][ 'software' ][ 'name' ], 'Instances' , 'Software' , '«' . $instrow [ 'URI' ] . '»' ));
if ( akeavinn ( 'usage' , $info [ 'x-nodeinfo' ]) && akeavinn ( 'users' , $info [ 'x-nodeinfo' ][ 'usage' ])) {
if ( akeavinn ( 'activeMonth' , $info [ 'x-nodeinfo' ][ 'usage' ][ 'users' ]))
$instrow [ 'ActiveUsersMonth' ] = truncn ( $info [ 'x-nodeinfo' ][ 'usage' ][ 'users' ][ 'activeMonth' ], 'Instances' , 'ActiveUsersMonth' , '«' . $instrow [ 'URI' ] . '»' );
if ( akeavinn ( 'activeHalfyear' , $info [ 'x-nodeinfo' ][ 'usage' ][ 'users' ]))
$instrow [ 'ActiveUsersHalfYear' ] = truncn ( $info [ 'x-nodeinfo' ][ 'usage' ][ 'users' ][ 'activeHalfyear' ], 'Instances' , 'ActiveUsersHalfYear' , '«' . $instrow [ 'URI' ] . '»' );
}
}
$whynot = array ();
if ( is_null ( $instrow [ 'RegOpen' ])) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'we don’ t know if it allows registrations' ;
2020-10-13 08:21:26 +02:00
} elseif ( $instrow [ 'RegOpen' ] == 0 ) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'it doesn’ t allow registrations' ;
2020-10-13 08:21:26 +02:00
}
if ( is_null ( $instrow [ 'UserCount' ])) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'we don’ t know its total users number' ;
2020-10-13 08:21:26 +02:00
} elseif ( $instrow [ 'UserCount' ] < 10 || $instrow [ 'UserCount' ] > 30000 ) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'total users number is not greater than 10 and less than 30000' ;
2020-10-13 08:21:26 +02:00
}
if ( is_null ( $instrow [ 'DomainCount' ])) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'we don’ t know the number of other instances it knows' ;
2020-10-13 08:21:26 +02:00
} elseif ( $instrow [ 'DomainCount' ] < 500 ) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'the number of other instances it knows is less than 500' ;
2020-10-13 08:21:26 +02:00
}
if ( ! is_null ( $instrow [ 'ActiveUsersMonth' ])) {
if ( $instrow [ 'ActiveUsersMonth' ] < 10 )
2022-12-01 05:41:54 +01:00
$whynot [] = 'the number of active users for the last month is less than 10' ;
2020-10-22 17:54:05 +02:00
} elseif ( ! is_null ( $instrow [ 'StatusCount' ]) && $instrow [ 'UserCount' ] > 0 && $instrow [ 'StatusCount' ] / $instrow [ 'UserCount' ] < 10 ) {
2022-12-01 05:41:54 +01:00
$whynot [] = 'the average number of toots for user is less than 10' ;
2020-10-13 08:21:26 +02:00
} else {
2022-12-01 05:41:54 +01:00
$whynot [] = 'it was impossible to detect the number of active users for the last month or the average number of toots for user' ;
2020-10-13 08:21:26 +02:00
}
if ( count ( $whynot ) == 0 ) {
$instrow [ 'Good' ] = 1 ;
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $host . '»: this is a suitable instance! :-)' . N );
2020-10-13 08:21:26 +02:00
$qgood ++ ;
} else {
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $host . '»: this is not a suitable instance: ' . implode ( '; ' , $whynot ) . ' :-(' . N );
2020-10-13 08:21:26 +02:00
}
2022-12-12 08:12:29 +01:00
$res = myq ( $link , 'SELECT * FROM Instances WHERE URI=\'' . myesc ( $link , $instrow [ 'URI' ]) . '\'' , __LINE__ );
2020-10-13 08:21:26 +02:00
2022-12-01 05:41:54 +01:00
$nrows = mysqli_num_rows ( $res );
if ( $nrows == 1 ) {
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $instrow [ 'URI' ] . '»: is already present in the database, updating it...' . N );
2020-10-13 08:21:26 +02:00
$oldinstrow = mysqli_fetch_assoc ( $res );
$instid = $oldinstrow [ 'ID' ];
$instrow [ 'ID' ] = $oldinstrow [ 'ID' ];
2022-12-01 05:41:54 +01:00
// if the instance already present in the db has FirstSeen=NULL, this means this is the first time it responds, so...
2020-10-13 08:21:26 +02:00
if ( is_null ( $oldinstrow [ 'FirstSeen' ])) {
2022-12-01 05:41:54 +01:00
$instrow [ 'FirstSeen' ] = $now ;
2020-10-13 08:21:26 +02:00
$instrow [ 'New' ] = 1 ;
} else {
$instrow [ 'FirstSeen' ] = $oldinstrow [ 'FirstSeen' ];
2022-12-01 05:41:54 +01:00
if ( $oldinstrow [ 'New' ] == 1 ) {
$instrow [ 'New' ] = 1 ;
if ( $now - $oldinstrow [ 'FirstSeen' ] > $opts [ 'oldline' ]) {
$instrow [ 'New' ] = 0 ;
2022-12-09 19:25:44 +01:00
notify ( 'Instance «<a href="viewinst.php?id=' . $instrow [ 'ID' ] . '">' . $instrow [ 'URI' ] . '</a>» is no longer new.' , 2 );
2022-12-01 05:41:54 +01:00
}
}
2020-10-13 08:21:26 +02:00
}
2022-12-01 05:41:54 +01:00
2020-10-13 08:21:26 +02:00
if ( $instrow [ 'Good' ] == 1 && $oldinstrow [ 'Good' ] == 0 ) {
2022-12-01 05:41:54 +01:00
notify ( 'Instance «<a href="viewinst.php?id=' . $instrow [ 'ID' ] . '">' . $instrow [ 'URI' ] . '</a>» wasn’ t suitable, but it is now!' , 1 );
2020-10-13 08:21:26 +02:00
} elseif ( $instrow [ 'Good' ] == 0 && $oldinstrow [ 'Good' ] == 1 ) {
2022-12-01 05:41:54 +01:00
notify ( 'Instance «<a href="viewinst.php?id=' . $instrow [ 'ID' ] . '">' . $instrow [ 'URI' ] . '</a>» was suitable, but it’ s no longer for these reasons: ' . implode ( '; ' , $whynot ), 1 );
2020-10-13 08:21:26 +02:00
}
$instrow [ 'Chosen' ] = $oldinstrow [ 'Chosen' ];
$instrow [ 'Priority' ] = $oldinstrow [ 'Priority' ];
$instrow [ 'Visible' ] = $oldinstrow [ 'Visible' ];
2022-11-30 07:19:14 +01:00
$instrow [ 'Noxious' ] = $oldinstrow [ 'Noxious' ];
$instrow [ 'NoxReason' ] = $oldinstrow [ 'NoxReason' ];
$instrow [ 'NoxLastModTS' ] = $oldinstrow [ 'NoxLastModTS' ];
2020-10-13 08:21:26 +02:00
if ( $instrow [ 'ShortDesc' ] != $oldinstrow [ 'ShortDesc' ])
2022-12-01 05:41:54 +01:00
notify ( '«Short description» of instance «<a href="viewinst.php?id=' . $instrow [ 'ID' ] . '">' . $instrow [ 'URI' ] . '</a>» has changed.' , 1 );
2020-10-13 08:21:26 +02:00
if ( $instrow [ 'LongDesc' ] != $oldinstrow [ 'LongDesc' ])
2022-12-01 05:41:54 +01:00
notify ( '«Long description» of instance «<a href="viewinst.php?id=' . $instrow [ 'ID' ] . '">' . $instrow [ 'URI' ] . '</a>» has changed.' , 1 );
2020-10-13 08:21:26 +02:00
$instrow [ 'OurDesc' ] = $oldinstrow [ 'OurDesc' ];
$instrow [ 'OurDescEN' ] = $oldinstrow [ 'OurDescEN' ];
$instrow [ 'LocalityID' ] = $oldinstrow [ 'LocalityID' ];
$instrow [ 'OurLangsLock' ] = $oldinstrow [ 'OurLangsLock' ];
$instrow [ 'GuestID' ] = $oldinstrow [ 'GuestID' ];
$instrow [ 'LastGuestEdit' ] = $oldinstrow [ 'LastGuestEdit' ];
$query = 'UPDATE Instances SET ' ;
foreach ( $instrow as $field => $value ) {
if ( ! is_null ( $value ))
$query .= $field . '=\'' . myesc ( $link , $value ) . '\', ' ;
else
$query .= $field . '=NULL, ' ;
}
$query = substr ( $query , 0 , - 2 ) . ' WHERE Instances.ID=' . $instrow [ 'ID' ];
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $host . '»: update query: «' . $query . '».' . N );
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , $query , __LINE__ );
2020-10-13 08:21:26 +02:00
2022-12-12 08:12:29 +01:00
$res = myq ( $link , 'SELECT InstID, LangID, Pos, Code FROM InstLangs LEFT JOIN Languages ON Languages.ID=LangID WHERE InstID=' . $instrow [ 'ID' ] . ' ORDER BY Pos ASC' , __LINE__ );
2020-10-13 08:21:26 +02:00
$oldinstlangs = array ();
while ( $row = mysqli_fetch_assoc ( $res ))
$oldinstlangs [] = $row ;
$instlangs = langs ( $instrow [ 'ID' ], $instrow [ 'URI' ], false );
if ( $instlangs != $oldinstlangs ) {
2022-12-01 05:41:54 +01:00
notify ( 'The list of languages declared by instance «<a href="viewinst.php?id=' . $instrow [ 'ID' ] . '">' . $instrow [ 'URI' ] . '</a>» has changed from «' . subarimp ( ', ' , 'Code' , $oldinstlangs ) . '» to «' . subarimp ( ', ' , 'Code' , $instlangs ) . '».' , 1 );
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , 'DELETE FROM InstLangs WHERE InstID=' . $instrow [ 'ID' ], __LINE__ );
foreach ( $instlangs as $row )
myq ( $link , 'INSERT INTO InstLangs (InstID, LangID, Pos) VALUES (' . $row [ 'InstID' ] . ', ' . $row [ 'LangID' ] . ', ' . $row [ 'Pos' ] . ')' , __LINE__ );
2020-10-13 08:21:26 +02:00
}
}
if ( $instrow [ 'OurLangsLock' ] == 0 ) {
$instourlangs = langs ( $instrow [ 'ID' ], $instrow [ 'URI' ], true );
2022-12-01 05:41:54 +01:00
// if instourlangs is empty and instlangs is not, set instourlangs as instlangs
2020-10-13 08:21:26 +02:00
if ( count ( $instourlangs ) == 0 && count ( $instlangs ) > 0 )
$instourlangs = $instlangs ;
if ( count ( $instourlangs ) > 0 ) {
2022-12-01 05:41:54 +01:00
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , 'DELETE FROM InstOurLangs WHERE InstID=' . $instrow [ 'ID' ], __LINE__ );
foreach ( $instourlangs as $row )
myq ( $link , 'INSERT INTO InstOurLangs (InstID, OurLangID, Pos) VALUES (' . $row [ 'InstID' ] . ', ' . $row [ 'LangID' ] . ', ' . $row [ 'Pos' ] . ')' , __LINE__ );
2020-10-13 08:21:26 +02:00
}
}
}
2022-12-01 05:41:54 +01:00
} elseif ( $nrows == 0 ) {
2022-12-11 23:29:51 +01:00
eecho ( 1 , '«' . $host . '» is not present in the database, adding it...' . N );
2020-10-13 08:21:26 +02:00
$instrow [ 'FirstSeen' ] = $now ;
if ( $opts [ 'setnew' ])
$instrow [ 'New' ] = 1 ;
$fields = array ();
$values = '' ;
foreach ( $instrow as $field => $value ) {
$fields [] = $field ;
if ( ! is_null ( $value ))
$values .= '\'' . myesc ( $link , $value ) . '\', ' ;
else
$values .= 'NULL, ' ;
}
$values = substr ( $values , 0 , - 2 );
2022-12-11 23:29:51 +01:00
$query = 'INSERT INTO Instances (' . implode ( ', ' , $fields ) . ', InsertTS) VALUES (' . $values . ', ' . $now . ')' ;
2022-12-09 22:53:18 +01:00
eecho ( 1 , '«' . $host . '»: insert query: «' . $query . '»' . N );
2020-10-13 08:21:26 +02:00
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , $query , __LINE__ );
2020-10-13 08:21:26 +02:00
$instid = mysqli_insert_id ( $link );
} else {
$instid = 0 ;
}
2022-12-11 23:29:51 +01:00
if ( $opts [ 'setnew' ] && ! $opts [ 'dryrun' ])
2022-12-01 05:41:54 +01:00
notify ( 'New instance found: «<a href="viewinst.php?id=' . $instid . '">' . $instrow [ 'URI' ] . '</a>».' , 1 );
2020-10-13 08:21:26 +02:00
$instlangs = langs ( $instid , $instrow [ 'URI' ], false );
2022-12-11 23:29:51 +01:00
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
foreach ( $instlangs as $row )
myq ( $link , 'INSERT INTO InstLangs (InstID, LangID, Pos) VALUES (' . $row [ 'InstID' ] . ', ' . $row [ 'LangID' ] . ', ' . $row [ 'Pos' ] . ')' , __LINE__ );
2020-10-13 08:21:26 +02:00
}
$instourlangs = langs ( $instid , $instrow [ 'URI' ], true );
2022-12-01 05:41:54 +01:00
// if instourlangs is empty and instlangs is not, set instourlangs as instlangs
2020-10-13 08:21:26 +02:00
if ( count ( $instourlangs ) == 0 && count ( $instlangs ) > 0 )
$instourlangs = $instlangs ;
2022-12-01 05:41:54 +01:00
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
foreach ( $instourlangs as $row )
myq ( $link , 'INSERT INTO InstOurLangs (InstID, OurLangID, Pos) VALUES (' . $row [ 'InstID' ] . ', ' . $row [ 'LangID' ] . ', ' . $row [ 'Pos' ] . ')' , __LINE__ );
2020-10-13 08:21:26 +02:00
}
if ( $instrow [ 'Good' ] == 1 )
2022-12-01 05:41:54 +01:00
notify ( 'New instance «<a href="viewinst.php?id=' . $instid . '">' . $instrow [ 'URI' ] . '</a>» is suitable!' , 1 );
2020-10-13 08:21:26 +02:00
2022-12-01 05:41:54 +01:00
} else {
2022-12-11 23:29:51 +01:00
notify ( 'Instance «' . $host . '» has ' . $nrows . ' entries in «Instances» table!' , 3 );
2020-10-13 08:21:26 +02:00
}
if ( array_key_exists ( 'x-activity' , $info ) && is_array ( $info [ 'x-activity' ])) {
2022-12-11 23:29:51 +01:00
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , 'DELETE FROM InstActivity WHERE InstID=' . $instid , __LINE__ );
2022-12-11 23:29:51 +01:00
$pos = 0 ;
foreach ( $info [ 'x-activity' ] as $buf ) {
if ( akeavinn ( 'week' , $buf ) && akeavinn ( 'statuses' , $buf ) && akeavinn ( 'logins' , $buf ) && akeavinn ( 'registrations' , $buf )) {
$pos ++ ;
$query = 'INSERT INTO InstActivity (InstID, Week, Statuses, Logins, Registrations, Pos) VALUES (\'' . $instid . '\', \'' . myesc ( $link , $buf [ 'week' ]) . '\', \'' . myesc ( $link , $buf [ 'statuses' ]) . '\', \'' . myesc ( $link , $buf [ 'logins' ]) . '\', \'' . myesc ( $link , $buf [ 'registrations' ]) . '\', ' . $pos . ')' ;
2022-12-12 08:12:29 +01:00
myq ( $link , $query , __LINE__ );
2022-12-11 23:29:51 +01:00
}
2020-10-13 08:21:26 +02:00
}
}
}
if ( array_key_exists ( 'x-trends' , $info ) && is_array ( $info [ 'x-trends' ])) {
$trends = array ();
foreach ( $info [ 'x-trends' ] as $buf ) {
if ( akeavinn ( 'name' , $buf ) && akeavinn ( 'url' , $buf ) && akeavinn ( 'history' , $buf ) && is_array ( $buf [ 'history' ])) {
$trend = 0 ;
foreach ( $buf [ 'history' ] as $row ) {
if ( $row [ 'uses' ] > 0 )
$trend += ( $row [ 'accounts' ] / $row [ 'uses' ]);
}
$trends [] = array (
'InstID' => $instid ,
'LastDay' => $buf [ 'history' ][ 0 ][ 'day' ],
'Name' => $buf [ 'name' ],
'URL' => $buf [ 'url' ],
'Pos' => null ,
'trend' => $trend
);
}
}
mdasortbykey ( $trends , 'trend' , true );
//print_r($trends);
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'DELETE FROM InstTrends WHERE InstID=' . $instid , __LINE__ );
2020-10-13 08:21:26 +02:00
$pos = 0 ;
foreach ( $trends as $trend ) {
$pos ++ ;
$query = 'INSERT INTO InstTrends (InstID, LastDay, Name, URL, Pos) VALUES (' . $trend [ 'InstID' ] . ', \'' . $trend [ 'LastDay' ] . '\', \'' . myesc ( $link , truncs ( $trend [ 'Name' ], 'InstTrends' , 'Name' , '«' . $instrow [ 'URI' ] . '»' )) . '\', \'' . myesc ( $link , truncs ( $trend [ 'URL' ], 'InstTrends' , 'URL' , '«' . $instrow [ 'URI' ] . '»' )) . '\', ' . $pos . ')' ;
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , $query , __LINE__ );
2020-10-13 08:21:26 +02:00
}
}
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'INSERT INTO InstChecks (InstID, Time, Status) VALUES (' . $instid . ', ' . $now . ', 1)' , __LINE__ );
2022-12-11 23:29:51 +01:00
if ( $opts [ 'fetchusers' ] && $ismast && array_key_exists ( 'version' , $info ) && $info [ 'version' ] >= '4.0.0' ) {
eecho ( 0 , '«' . $host . '»: trying to fetch users info from directory API...' . N );
$exusers = []; // array of this instance's users already existing in the db
2022-12-12 08:12:29 +01:00
$res = myq ( $link , 'SELECT ID, locid, username FROM Users WHERE InstID=' . $instid , __LINE__ );
2022-12-11 23:29:51 +01:00
while ( $row = mysqli_fetch_assoc ( $res )) $exusers [ $row [ 'locid' ]] = $row ;
$users = []; // array of users in this instance's directory
$chunk = 0 ;
$limit = 80 ;
$end = false ;
while ( ! $end ) {
$offset = $chunk * $limit ;
2022-12-16 21:59:26 +01:00
$buf =@ gurl ( 'https://' . $host . '/api/v1/directory?local=1&order=new&limit=' . $limit . '&offset=' . $offset , $opts [ 'timeout' ]);
2022-12-11 23:29:51 +01:00
if ( $buf [ 'cont' ] !== false ) {
ckratelimit ( $buf [ 'headers' ]);
eecho ( 1 , '«' . $host . '»: got ' . ( $chunk + 1 ) . ' chunk(s) of users info from directory API :-)' . N );
$buf =@ json_decode ( $buf [ 'cont' ], true );
if ( is_array ( $buf )) {
//print_r($buf);
if ( count ( $buf ) < $limit ) $end = true ;
/* if ( count ( $buf ) > 0 && ! array_key_exists ( 'noindex' , $buf [ 0 ])) {
eecho ( 2 , '«' . $host . '»: account entities reported by directory api endpoint don’ t have a “noindex” attribute; skipping directory fetching.' . N );
break ;
} else {
eecho ( 0 , '«' . $host . '»: account entities reported by directory api endpoint do have a “noindex” attribute; continuing with directory fetching.' . N );
} */
//foreach ($buf as $user) echo($user['username'].' '); echo(N.N);
foreach ( $buf as $user ) {
if ( make ([ 'id' , 'username' , 'display_name' , 'locked' , 'bot' , 'discoverable' , 'created_at' , 'note' , 'url' , 'avatar' , 'header' , 'statuses_count' , 'last_status_at' , 'fields' , 'noindex' ], $user )) {
eecho ( 0 , '«' . $host . '» (' . $i . '/' . $cinsts . '): working on user «' . $user [ 'username' ] . '»...' . N );
// disabled because it takes too long on instances with many users
/* if ( ! isset ( $user [ 'noindex' ])) {
$user [ 'noindex' ] = true ;
eecho ( 0 , '«' . $host . '»: «' . $user [ 'username' ] . '»: «noindex» is undefined, trying to define it by fetching user’ s profile page...' . N );
2022-12-16 21:59:26 +01:00
$page = gurl ( $user [ 'url' ], $opts [ 'timeout' ]);
2022-12-11 23:29:51 +01:00
// here ckratelimit is not needed because it's a normal web page, not json from mastodon api
if ( $page [ 'cont' ] !== false ) {
//<meta content='noindex, noarchive' name='robots'>
if ( preg_match ( '/<meta\s+content=[\'"](noindex|noarchive)/ui' , $page [ 'cont' ]) !== 1 ) {
$user [ 'noindex' ] = false ;
eecho ( 0 , '«' . $user [ 'url' ] . '»: «noindex» is not set.' . N );
} else {
eecho ( 0 , '«' . $user [ 'url' ] . '»: «noindex» is set.' . N );
}
} else {
eecho ( 2 , '«' . $host . '»: could not fetch «' . $user [ 'url' ] . '»: ' . $page [ 'emsg' ] . N );
}
} */
$snote = strip_tags ( $user [ 'note' ]);
if ( preg_match ( '/(?<!\w)#(nobots?|noindex)(?!\w)/iu' , $snote ) === 1 ) $user [ 'noindex' ] = true ;
if ( preg_match ( '/(?<!\w)#(okindex|yesindex|doindex|okmhindex)(?!\w)/iu' , $snote ) === 1 ) $user [ 'noindex' ] = false ;
// disabled; takes too long on instances with many users
/* $user [ 'tags' ] = [];
if ( ! $user [ 'noindex' ] && $info [ 'version' ] >= '3.3.0' ) {
eecho ( 0 , '«' . $host . '»: trying to fetch tags for user «' . $user [ 'username' ] . '»...' . N );
2022-12-16 21:59:26 +01:00
$tags =@ gurl ( 'https://' . $host . '/api/v1/accounts/' . $user [ 'id' ] . '/featured_tags' , $opts [ 'timeout' ]);
2022-12-11 23:29:51 +01:00
if ( $tags [ 'cont' ] !== false ) {
ckratelimit ( $tags [ 'headers' ]);
$tags =@ json_decode ( $tags [ 'cont' ], true );
if ( is_array ( $tags ) && count ( $tags ) > 0 ) {
eecho ( 1 , '«' . $host . '»: got ' . count ( $tags ) . ' tag(s) for user «' . $user [ 'username' ] . '» :-)' . N );
foreach ( $tags as $tag ) $user [ 'tags' ][] = $tag [ 'name' ];
}
} else {
eecho ( 2 , '«' . $host . '»: could not fetch tags for user «' . $user [ 'username' ] . '» :-( (' . $tags [ 'emsg' ] . ').' . N );
}
}
$user [ 'tags' ] = implode ( ';' , $user [ 'tags' ]);
if ( $user [ 'tags' ] == '' ) $user [ 'tags' ] = null ; */
$user [ 'tags' ] = null ;
if ( ! is_null ( $user [ 'created_at' ])) $user [ 'created_at' ] = pgdatetomy ( $user [ 'created_at' ]);
if ( ! is_null ( $user [ 'last_status_at' ])) $user [ 'last_status_at' ] = datetomy ( $user [ 'last_status_at' ]);
$users [ $user [ 'id' ]] = $user ;
} else {
eecho ( 2 , '«' . $host . '»: user record missed some required keys :-(' . N );
//print_r($user);
}
}
} else {
eecho ( 2 , '«' . $host . '»: ... but the chunk was not good JSON :-(' . N );
$end = true ;
}
$chunk ++ ;
} else {
eecho ( 2 , '«' . $host . '»: could not fetch users info from directory API: ' . $buf [ 'emsg' ] . N );
$end = true ;
}
}
foreach ( $users as $locid => $user ) {
$query = 'SET InstID=' . $instid . ', host=' . myv ( $link , $host ) . ', locid=' . myv ( $link , $user [ 'id' ]) . ', username=' . myv ( $link , truncs ( $user [ 'username' ], 'Users' , 'username' , '«' . $host . '»: «' . $user [ 'username' ] . '»' )) . ', display_name=' . myv ( $link , truncs ( $user [ 'display_name' ], 'Users' , 'display_name' , '«' . $host . '»: «' . $user [ 'username' ] . '»' )) . ', locked=' . myv ( $link , $user [ 'locked' ]) . ', bot=' . myv ( $link , $user [ 'bot' ]) . ', created_at=' . myv ( $link , $user [ 'created_at' ]) . ', note=' . myv ( $link , truncs ( $user [ 'note' ], 'Users' , 'note' , '«' . $host . '»: «' . $user [ 'username' ] . '»' )) . ', url=' . myv ( $link , truncs ( $user [ 'url' ], 'Users' , 'url' , '«' . $host . '»: «' . $user [ 'username' ] . '»' )) . ', avatar=' . myv ( $link , truncs ( $user [ 'avatar' ], 'Users' , 'avatar' , '«' . $host . '»: «' . $user [ 'username' ] . '»' )) . ', header=' . myv ( $link , truncs ( $user [ 'header' ], 'Users' , 'header' , '«' . $host . '»: «' . $user [ 'username' ] . '»' )) . ', statuses_count=' . myv ( $link , $user [ 'statuses_count' ]) . ', last_status_at=' . myv ( $link , $user [ 'last_status_at' ]) . ', tags=' . myv ( $link , truncs ( $user [ 'tags' ], 'Users' , 'tags' , '«' . $host . '»: «' . $user [ 'username' ] . '»' ));
$uid = 0 ;
if ( ! array_key_exists ( $user [ 'id' ], $exusers )) {
if ( ! $user [ 'noindex' ]) {
eecho ( 0 , '«' . $host . '»: inserting new user «' . $user [ 'username' ] . '»...' . N );
$query = 'INSERT INTO Users ' . $query ;
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , $query , __LINE__ );
2022-12-11 23:29:51 +01:00
$uid = mysqli_insert_id ( $link );
}
} else {
eecho ( 0 , '«' . $host . '»: NOT inserting user «' . $user [ 'username' ] . '» because they don’ t want to be indexed...' . N );
}
} else {
$uid = $exusers [ $locid ][ 'ID' ];
if ( ! $user [ 'noindex' ]) {
eecho ( 0 , '«' . $host . '»: updating existing user «' . $user [ 'username' ] . '» (' . $uid . ')...' . N );
$query = 'UPDATE Users ' . $query . ' WHERE ID=' . $uid ;
} else {
eecho ( 0 , '«' . $host . '»: deleting existing user «' . $user [ 'username' ] . '» (' . $uid . ') because they don’ t want to be indexed...' . N );
$query = 'DELETE FROM Users WHERE ID=' . $uid ;
}
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , $query , __LINE__ );
myq ( $link , 'DELETE FROM UsersFields WHERE UserID=' . $uid , __LINE__ );
2022-12-11 23:29:51 +01:00
}
}
if ( $uid != 0 && ! $user [ 'noindex' ] && is_array ( $user [ 'fields' ]) && count ( $user [ 'fields' ]) > 0 ) {
eecho ( 0 , '«' . $host . '»: saving user fields for user «' . $user [ 'username' ] . '» (' . $uid . ')...' . N );
foreach ( $user [ 'fields' ] as $field ) {
( is_null ( $field [ 'verified_at' ])) ? $field [ 'verified_at' ] = 0 : $field [ 'verified_at' ] = 1 ;
$field [ 'name' ] = truncs ( $field [ 'name' ], 'UsersFields' , 'name' , '«' . $host . '»: «' . $user [ 'username' ] . '»' );
$field [ 'value' ] = truncs ( $field [ 'value' ], 'UsersFields' , 'value' , '«' . $host . '»: «' . $user [ 'username' ] . '»' );
2022-12-12 08:12:29 +01:00
if ( ! $opts [ 'dryrun' ]) myq ( $link , 'INSERT INTO UsersFields SET UserID=' . $uid . ', name=' . myv ( $link , $field [ 'name' ]) . ', value=' . myv ( $link , $field [ 'value' ]) . ', verified=' . $field [ 'verified_at' ], __LINE__ );
2022-12-11 23:29:51 +01:00
}
}
}
foreach ( $exusers as $locid => $exuser ) {
if ( ! array_key_exists ( $locid , $users )) {
eecho ( 0 , '«' . $host . '»: user «' . $exusers [ $locid ][ 'username' ] . '» opted out of the directory, deleting their record (' . $exuser [ 'ID' ] . ')...' . N );
if ( ! $opts [ 'dryrun' ]) {
2022-12-12 08:12:29 +01:00
myq ( $link , 'DELETE FROM Users WHERE ID=' . $exuser [ 'ID' ], __LINE__ );
myq ( $link , 'DELETE FROM UsersFields WHERE UserID=' . $exuser [ 'ID' ], __LINE__ );
2022-12-11 23:29:51 +01:00
}
}
}
}
2020-10-13 08:21:26 +02:00
}
}
}
// ---- normal end of run: release resources, clean up, report, exit ----

// Close the database connection and drop the variable, so that code testing
// isset($link) no longer sees a handle.
mysqli_close($link);
unset($link);

// When JSON output was requested, write the closing member and brace, then close the file.
if ($opts['jsonwrite']) {
    fwrite($jsonf, '"The end?": true' . N . '}' . N);
    fclose($jsonf);
}

// Remove the files this run no longer needs
// (presumably work/state files and the lock file created earlier in the script - confirm).
foreach ([$instsjfp, $currinstjfp, $lockfp] as $stalefile) {
    unlink($stalefile);
}

// Report the total running time, measured from $tini, and leave with a success status.
$elapsed = time() - $tini;
eecho(1, 'Done (in ' . ght($elapsed, null, 0) . ') :-)' . N);

exit(0);
2022-12-08 00:03:10 +01:00
/**
 * "Multi array_key_exists": tells whether every key listed in $keys exists in $arr.
 *
 * A key counts as present even when its value is null.
 *
 * @param array $keys keys that must all be present
 * @param array $arr  array to inspect (received by reference, never modified here)
 * @return bool true when no key is missing (also when $keys is empty), false otherwise
 */
function make($keys, &$arr) {
    $allthere = true;
    foreach ($keys as $wanted) {
        if (!array_key_exists($wanted, $arr)) {
            // one missing key is enough to decide
            $allthere = false;
            break;
        }
    }
    return $allthere;
}
/**
 * Renders a PHP value as a literal ready to be concatenated into an SQL statement.
 *
 * - null, and any value that is empty once trimmed, become the keyword NULL;
 * - booleans become '1' or '0' (unquoted);
 * - everything else is escaped with mysqli_real_escape_string() and wrapped in single quotes.
 *
 * @param mysqli $link connection handle, used only for escaping
 * @param mixed  $var  value to render
 * @return string SQL literal
 */
function myv(&$link, $var) {
    if (is_null($var)) {
        return 'NULL';
    }
    if (is_bool($var)) {
        return $var ? '1' : '0';
    }
    // blank strings are stored as NULL rather than as ''
    if (trim($var) === '') {
        return 'NULL';
    }
    return "'" . mysqli_real_escape_string($link, $var) . "'";
}
/**
 * Converts a "YYYY-MM-DD" date into a Unix timestamp for local midnight of that day.
 *
 * Only the leading year-month-day part is read, so a full timestamp such as
 * "2019-11-24T15:49:42.251Z" is accepted and its time part ignored.
 * The result depends on PHP's default timezone, because mktime() is used.
 *
 * @param string $date date string starting with year-month-day
 * @return int|null timestamp, or null when $date does not start with a year-month-day
 *                  triple (previously such input raised warnings or a TypeError)
 */
function datetomy($date) {
    if (!is_string($date) || preg_match('/^\s*(\d+)-(\d{1,2})-(\d{1,2})/', $date, $parts) !== 1) {
        return null;
    }
    // explicit int casts: mktime() must not be handed strings with trailing garbage
    return mktime(0, 0, 0, (int) $parts[2], (int) $parts[3], (int) $parts[1]);
}
2022-12-11 23:29:51 +01:00
/**
 * Honours the rate limit advertised in an API response head.
 *
 * Parses the raw HTTP response head and, when fewer than 3 requests are left
 * («x-ratelimit-remaining»), sleeps until one second past the reset time
 * («x-ratelimit-reset»). The wait is computed against the server's own «date»
 * header rather than the local clock.
 *
 * If any of the three headers is missing, or a timestamp cannot be parsed,
 * a warning is logged and no sleep is performed.
 *
 * @param string $httpresphead raw response head: status line followed by "Name: value" lines
 * @return void
 */
function ckratelimit($httpresphead) {
    // accept both CRLF and bare LF as line separators
    $lines = preg_split('/\r?\n/', $httpresphead);
    // the first line is the status line, not a header
    array_shift($lines);
    $headers = [];
    foreach ($lines as $line) {
        if (preg_match('/^([^:]+):(.*)$/Uu', $line, $matches) === 1)
            $headers[strtolower($matches[1])] = trim($matches[2]);
    }

    if (!array_key_exists('x-ratelimit-reset', $headers)) {
        eecho(2, 'ckratelimit: $httpresphead did not contain an «x-ratelimit-reset» header!' . N);
        return;
    }
    if (!array_key_exists('date', $headers)) {
        eecho(2, 'ckratelimit: $httpresphead did not contain a «date» header!' . N);
        return;
    }
    if (!array_key_exists('x-ratelimit-remaining', $headers)) {
        // without this header there is no way to tell whether the limit is near
        eecho(2, 'ckratelimit: $httpresphead did not contain an «x-ratelimit-remaining» header!' . N);
        return;
    }

    // e.g. «Wed, 30 Mar 2022 21:27:22 GMT»
    $srvnow = strtotime($headers['date']);
    // e.g. «2022-03-31T04:05:00.058705Z»
    $srvrlr = strtotime($headers['x-ratelimit-reset']);
    if ($srvnow === false || $srvrlr === false) {
        eecho(2, 'ckratelimit: could not parse the «date» or «x-ratelimit-reset» header!' . N);
        return;
    }

    // never negative: sleep() throws a ValueError on negative input since PHP 8
    $stosl = max(0, $srvrlr - $srvnow + 1);
    if ($headers['x-ratelimit-remaining'] < 3) {
        eecho(2, 'reached rate limit, sleeping for ' . $stosl . ' seconds ...' . N);
        sleep($stosl);
    }
}
2020-10-13 08:21:26 +02:00
?>