peerscrawl.php 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. #!/usr/bin/php
  2. <?php
  3. /*
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>.
  14. */
  15. setlocale(LC_ALL,getenv('LANG'));
  16. define('N',"\n");
  17. $opts=array(
  18. 'startinst'=>'mastodon.social',
  19. 'peersfp'=>'peers',
  20. 'restore'=>false,
  21. 'excludefp'=>null
  22. );
  23. $help='peerscrawl.php
  24. DESCRIPTION
  25. This program tries to build a fairly complete list of mastodon instances.
  26. SYNOPSIS
  27. peerscrawl.php [options]
  28. OPTIONS
  29. -s, --startinst <domain>
  30. Defines the first instance to crawl.
  31. DEFAULT: «'.$opts['startinst'].'»
  32. -p, --peersfp <file>
  33. Defines the file into which the ordered list of instances will be saved.
  34. DEFAULT: «'.$opts['peersfp'].'»
  35. -r, --restore
  36. If peers file already exists on program’s start it will be loaded into
  37. memory and each instance it contains will be considered “already
  38. crawled”, thus allowing to “restore an interrupted crawling session”.
  39. -e, --excludefp <file>
  40. Defines a file containing exclusion rules: one regular expression per
  41. line (empty lines are ignored). Any instance matching any defined regex
  42. will be ignored by the program. Changes made to this file during program
  43. execution will be taken into account.
  44. This program comes with ABSOLUTELY NO WARRANTY; for details see the source.
  45. This is free software, and you are welcome to redistribute it under
  46. certain conditions; see <http://www.gnu.org/licenses/> for details.'.N;
  47. for ($i=1; $i<$argc; $i++) {
  48. if (substr($argv[$i],0,1)=='-') {
  49. switch($argv[$i]) {
  50. case '-s':
  51. case '--startinst':
  52. if ($i+1>=$argc)
  53. mexit('Option «'.$argv[$i].'» has to be followed by a domain name (use «-h» for more info).'.N,1);
  54. $i++;
  55. $opts['startinst']=$argv[$i];
  56. break;
  57. case '-p':
  58. case '--peersfp':
  59. if ($i+1>=$argc)
  60. mexit('Option «'.$argv[$i].'» has to be followed by a file’s path (use «-h» for more info).'.N,1);
  61. $i++;
  62. $opts['peersfp']=$argv[$i];
  63. break;
  64. case '-r':
  65. case '--restore':
  66. $opts['restore']=true;
  67. $i++;
  68. break;
  69. case '-e':
  70. case '--excludefp':
  71. if ($i+1>=$argc || !file_exists($argv[$i+1]) || !is_file($argv[$i+1]) || !is_readable($argv[$i+1]))
  72. mexit('Option «'.$argv[$i].'» has to be followed by an existing, readable file’s path (use «-h» for more info).'.N,1);
  73. $i++;
  74. $opts['excludefp']=$argv[$i];
  75. break;
  76. case '-h':
  77. case '--help':
  78. mexit($help,0);
  79. break;
  80. default:
  81. mexit('Option «'.$argv[$i].'» is unknown (use «-h» for more info).'.N,1);
  82. break;
  83. }
  84. }
  85. }
  86. function mexit($msg,$code) {
  87. echo($msg);
  88. exit($code);
  89. }
  90. function shutdown($dosort) {
  91. global $opts, $peersf, $insts;
  92. if ($peersf) @fclose($peersf);
  93. if ($dosort) {
  94. echo('Saving ordered instances list into «'.$opts['peersfp'].'».'.N);
  95. sort($insts);
  96. $peersf=@fopen($opts['peersfp'],'w');
  97. if ($peersf!==false) {
  98. foreach ($insts as $inst)
  99. fwrite($peersf,$inst.N);
  100. fclose($peersf);
  101. } else {
  102. echo('Couldn’t open «'.$opts['peersfp'].'» for writing.'.N);
  103. }
  104. }
  105. }
  106. declare(ticks=1);
  107. function signalHandler($signal) {
  108. echo(N.'I got interrupted (signal: '.$signal.').'.N);
  109. shutdown(false);
  110. exit(2);
  111. }
  112. pcntl_signal(SIGTERM,'signalHandler');// Termination ('kill' was called)
  113. pcntl_signal(SIGHUP,'signalHandler');// Terminal log-out
  114. pcntl_signal(SIGINT,'signalHandler');// Interrupted (Ctrl-C is pressed)
  115. $contextopts=array(
  116. 'http'=>array(
  117. 'timeout'=>3
  118. ),
  119. 'socket'=>array(
  120. 'tcp_nodelay'=>true
  121. )
  122. );
  123. $context=stream_context_create($contextopts);
  124. $insts=array();
  125. $exarr=array();
  126. if ($opts['restore']) {
  127. if (file_exists($opts['peersfp']) && is_file($opts['peersfp']) && is_readable($opts['peersfp'])) {
  128. echo('Loading «'.$opts['peersfp'].'».'.N);
  129. $insts=file($opts['peersfp'],FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
  130. } else {
  131. echo('WARNING: I couldn’t open «'.$opts['peersfp'].'» for reading.'.N);
  132. }
  133. }
  134. $peersf=@fopen($opts['peersfp'],'w');
  135. function isempty($val) {
  136. if (preg_match('/^\s*$/',$val)===1)
  137. return(true);
  138. else
  139. return(false);
  140. }
  141. function waituntilonline() {
  142. global $context;
  143. $url='www.google.com';
  144. while (false===($f=@fsockopen($url,80,$errno,$errstr,1))) {
  145. echo(strftime('%c').' - WARNING: it seems we are offline :-('.N);
  146. sleep(5);
  147. }
  148. fclose($f);
  149. // echo(strftime('%c').' - it seems we are online! :-)'.N);
  150. }
  151. function updexarr() {
  152. global $exarr, $opts;
  153. if (!is_null($opts['excludefp'])) {
  154. $f=@fopen($opts['excludefp'],'r');
  155. if ($f!==false) {
  156. $i=0;
  157. $exarr=array();
  158. while (!feof($f)) {
  159. $i++;
  160. $line=trim(fgets($f));
  161. if (!isempty($line)) {
  162. if (@preg_match($line,'foo')!==false)
  163. $exarr[]=$line;
  164. else
  165. echo('WARNING: «'.$opts['excludefp'].'», line '.$i.': «'.$line.'» is not a valid regular expression.'.N);
  166. }
  167. }
  168. } else {
  169. echo('WARNING: I could not open «'.$opts['excludefp'].'» for reading.'.N);
  170. }
  171. }
  172. }
  173. function ckexarr($inst) {
  174. global $exarr;
  175. foreach ($exarr as $re)
  176. if (preg_match($re,$inst)===1) return(true);
  177. return(false);
  178. }
  179. function crawl($inst) {
  180. global $insts, $peersf, $context;
  181. waituntilonline();
  182. updexarr();
  183. if (!isempty($inst)) {
  184. if (!in_array($inst,$insts)) {
  185. echo('«'.$inst.'» is not a known instance, I add it to the list of known instances.'.N);
  186. $insts[]=$inst;
  187. fwrite($peersf,$inst.N);
  188. }
  189. echo('«'.$inst.'»: trying to load instance’s peers ... ');
  190. $peers=@file_get_contents('https://'.$inst.'/api/v1/instance/peers',false,$context);
  191. if ($peers!=false) {
  192. echo('OK :-)'.N);
  193. $peers=json_decode($peers,true);
  194. if (is_array($peers)) {
  195. foreach ($peers as $peer) {
  196. if (is_string($peer)) {
  197. if (!ckexarr($peer)) {
  198. if (!in_array($peer,$insts)) {
  199. echo('>>> I will crawl «'.$peer.'».'.N);
  200. crawl($peer);
  201. }/* else {
  202. echo('>>> I won’t crawl «'.$peer.'» because I already did.'.N);
  203. }*/
  204. } else {
  205. echo('>>> I won’t crawl «'.$peer.'» because its name matches with an exclusion regex.'.N);
  206. }
  207. } else {
  208. echo('>>> I won’t crawl this peer because its name is not a string.'.N);
  209. }
  210. }
  211. }
  212. } else {
  213. echo('ERROR :-('.N);
  214. }
  215. } else {
  216. echo('I WON’T add nameless instances.'.N);
  217. }
  218. echo('~~~~~~~ Stats: '.count($insts).' known istances ~~~~~~~'.N);
  219. }
  220. crawl($opts['startinst']);
  221. echo('DONE CRAWLING! :-)'.N);
  222. shutdown(true);
  223. exit(0);
  224. ?>