extract.py

'''
Extract relevant information from a URL.

Most of the code comes from jarun/buku, licensed under GPLv3.
'''
import os
import certifi
import cgi
from logging import getLogger
import re
import urllib3
from urllib3.exceptions import LocationParseError
from urllib3.util import parse_url, make_headers

from bs4 import BeautifulSoup

logger = getLogger()

MYHEADERS = None  # Default dictionary of headers
MYPROXY = None  # Proxy URL, filled in by gen_headers() from https_proxy
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"
)
SKIP_MIMES = {".pdf", ".txt"}


def parse_decoded_page(page):
    """Fetch title, description and keywords from decoded HTML page.

    Parameters
    ----------
    page : str
        Decoded HTML page.

    Returns
    -------
    tuple
        (title, description, keywords).
    """
    title = None
    desc = None
    keys = None
    soup = BeautifulSoup(page, "html5lib")

    try:
        title = soup.find("title").text.strip().replace("\n", " ")
        if title:
            title = re.sub(r"\s{2,}", " ", title)
    except Exception as e:
        logger.debug(e)

    description = (
        soup.find("meta", attrs={"name": "description"})
        or soup.find("meta", attrs={"name": "Description"})
        or soup.find("meta", attrs={"property": "description"})
        or soup.find("meta", attrs={"property": "Description"})
        or soup.find("meta", attrs={"name": "og:description"})
        or soup.find("meta", attrs={"name": "og:Description"})
        or soup.find("meta", attrs={"property": "og:description"})
        or soup.find("meta", attrs={"property": "og:Description"})
    )
    try:
        if description:
            desc = description.get("content").strip()
            if desc:
                desc = re.sub(r"\s{2,}", " ", desc)
    except Exception as e:
        logger.debug(e)

    keywords = soup.find("meta", attrs={"name": "keywords"}) or soup.find(
        "meta", attrs={"name": "Keywords"}
    )
    try:
        if keywords:
            keys = keywords.get("content").strip().replace("\n", " ")
            keys = re.sub(r"\s{2,}", " ", keys)
            if is_unusual_tag(keys):
                if keys not in (title, desc):
                    logger.debug("keywords to description: %s", keys)
                    if desc:
                        desc = desc + "\n## " + keys
                    else:
                        desc = "* " + keys
                keys = None
    except Exception as e:
        logger.debug(e)

    logger.debug("title: %s", title)
    logger.debug("desc : %s", desc)
    logger.debug("keys : %s", keys)

    return (title, desc, keys)
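
# Illustrative sketch (not part of the original module): for a decoded page
# like '<html><head><title>My  Page</title>'
# '<meta name="description" content="A   demo page"></head></html>',
# parse_decoded_page() collapses repeated whitespace and would return roughly
# ("My Page", "A demo page", None), assuming html5lib is installed.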


def get_data_from_page(resp):
    """Detect HTTP response encoding and invoke parser with decoded data.

    Parameters
    ----------
    resp : HTTP response
        Response from GET request.

    Returns
    -------
    tuple
        (title, description, keywords).
    """
    try:
        soup = BeautifulSoup(resp.data, "html.parser")
    except Exception as e:
        logger.error("get_data_from_page(): %s", e)

    try:
        charset = None

        if soup.meta and soup.meta.get("charset") is not None:
            charset = soup.meta.get("charset")
        elif "content-type" in resp.headers:
            _, params = cgi.parse_header(resp.headers["content-type"])
            if params.get("charset") is not None:
                charset = params.get("charset")

        if not charset and soup:
            meta_tag = soup.find("meta", attrs={"http-equiv": "Content-Type"})
            if meta_tag:
                _, params = cgi.parse_header(meta_tag.attrs["content"])
                charset = params.get("charset", charset)

        if charset:
            logger.debug("charset: %s", charset)
            title, desc, keywords = parse_decoded_page(
                resp.data.decode(charset, errors="replace")
            )
        else:
            title, desc, keywords = parse_decoded_page(
                resp.data.decode(errors="replace")
            )

        return (title, desc, keywords)
    except Exception as e:
        logger.error(e)
        return (None, None, None)
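
# Charset lookup order above: <meta charset=...>, then the Content-Type
# response header, then <meta http-equiv="Content-Type" content="...">.
# For instance, a 'text/html; charset=ISO-8859-1' header makes
# cgi.parse_header() return ('text/html', {'charset': 'ISO-8859-1'}),
# so the body is decoded with that charset before parsing.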


def get_PoolManager(MYPROXY=None):
    """Create a pool manager with proxy support, if applicable.

    Parameters
    ----------
    MYPROXY : str, optional
        Proxy URL, e.g. the value of the https_proxy environment variable.

    Returns
    -------
    ProxyManager or PoolManager
        ProxyManager if https_proxy is defined, PoolManager otherwise.
    """
    if MYPROXY:
        return urllib3.ProxyManager(
            MYPROXY,
            num_pools=1,
            headers=MYHEADERS,
            timeout=15,
            cert_reqs="CERT_REQUIRED",
            ca_certs=certifi.where(),
        )

    return urllib3.PoolManager(
        num_pools=1,
        headers=MYHEADERS,
        timeout=15,
        cert_reqs="CERT_REQUIRED",
        ca_certs=certifi.where(),
    )
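
# Minimal usage sketch (assumes gen_headers() has already filled MYHEADERS):
#   manager = get_PoolManager(os.environ.get("https_proxy"))
#   resp = manager.request("GET", "https://example.com")
#   manager.clear()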


def network_handler(url, http_head=False):
    """Handle server connection and redirections.

    Parameters
    ----------
    url : str
        URL to fetch.
    http_head : bool
        If True, send only an HTTP HEAD request. Default is False.

    Returns
    -------
    tuple
        (title, description, tags, recognized mime, bad url).
    """
    page_title = None
    page_desc = None
    page_keys = None
    exception = False
    manager = None

    if is_nongeneric_url(url) or is_bad_url(url):
        return (None, None, None, 0, 1)

    if is_ignored_mime(url) or http_head:
        method = "HEAD"
    else:
        method = "GET"

    if not MYHEADERS:
        gen_headers()

    try:
        manager = get_PoolManager(MYPROXY)  # MYPROXY is set by gen_headers()

        while True:
            resp = manager.request(method, url)

            if resp.status == 200:
                if method == "GET":
                    page_title, page_desc, page_keys = get_data_from_page(resp)
            elif resp.status == 403 and url.endswith("/"):
                # HTTP 403 Forbidden: URLs of the form https://www.domain.com/
                # can fail when fetching resource '/', so retry the request
                # without the trailing '/'
                logger.debug("Received status 403: retrying...")
                # Remove trailing /
                url = url[:-1]
                resp.close()
                continue
            else:
                logger.error("[%s] %s", resp.status, resp.reason)

            if resp:
                resp.close()

            break
    except Exception as e:
        logger.error("network_handler(): %s", e)
        exception = True
    finally:
        if manager:
            manager.clear()

    if exception:
        return (None, None, None, 0, 0)

    if method == "HEAD":
        return ("", "", "", 1, 0)

    if page_title is None:
        return ("", page_desc, page_keys, 0, 0)

    return (page_title, page_desc, page_keys, 0, 0)
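
# Sketch of the return contract (values on success; not from the original code):
#   network_handler("https://example.com")       -> (title, desc, keys, 0, 0)
#   network_handler("https://example.com/a.pdf") -> ("", "", "", 1, 0)  # HEAD only
#   network_handler("chrome://settings")         -> (None, None, None, 0, 1)  # non-generic/bad URL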


def is_bad_url(url):
    """Check if URL is malformed.

    .. Note:: This API is not bulletproof but works in most cases.

    Parameters
    ----------
    url : str
        URL to scan.

    Returns
    -------
    bool
        True if URL is malformed, False otherwise.
    """
    # Get the netloc token
    try:
        netloc = parse_url(url).netloc
    except LocationParseError as e:
        logger.error("%s, URL: %s", e, url)
        return True

    if not netloc:
        # Try to prepend '//' and get netloc
        netloc = parse_url("//" + url).netloc
        if not netloc:
            return True

    logger.debug("netloc: %s", netloc)

    # netloc cannot start or end with a '.'
    if netloc.startswith(".") or netloc.endswith("."):
        return True

    # netloc should have at least one '.'
    if netloc.rfind(".") < 0:
        return True

    return False
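
# Examples (sketch): is_bad_url("https://www.example.com") is False, while
# is_bad_url(".example.com") and is_bad_url("https://localhost") are True
# (netloc starting with '.' and netloc without any '.', respectively).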


def is_nongeneric_url(url):
    """Returns True for URLs which are non-http and non-generic.

    Parameters
    ----------
    url : str
        URL to scan.

    Returns
    -------
    bool
        True if URL is a non-generic URL, False otherwise.
    """
    ignored_prefix = ["about:", "apt:", "chrome://", "file://", "place:"]

    for prefix in ignored_prefix:
        if url.startswith(prefix):
            return True

    return False
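
# Sketch: is_nongeneric_url("file:///tmp/notes.txt") is True,
# is_nongeneric_url("https://example.com") is False.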


def is_unusual_tag(tagstr):
    """Identify unusual tags with word to comma ratio > 3.

    Parameters
    ----------
    tagstr : str
        Tag string to check.

    Returns
    -------
    bool
        True if the tag string looks unusual (free text rather than tags),
        False otherwise.
    """
    if not tagstr:
        return False

    nwords = len(tagstr.split())
    ncommas = tagstr.count(",") + 1

    if nwords / ncommas > 3:
        return True

    return False
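
# Worked example (sketch): for "free text that reads like a sentence",
# nwords = 7 and ncommas = 1, so 7 / 1 > 3 and the string is flagged as
# unusual; for "python, linux, cli", nwords = 3 and ncommas = 3, so the
# ratio is 1 and the string is kept as tags.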


def is_ignored_mime(url):
    """Check if URL links to ignored MIME.

    .. Note:: Only a 'HEAD' request is made for these URLs.

    Parameters
    ----------
    url : str
        URL to scan.

    Returns
    -------
    bool
        True if URL links to ignored MIME, False otherwise.
    """
    for mime in SKIP_MIMES:
        if url.lower().endswith(mime):
            logger.debug("matched MIME: %s", mime)
            return True

    return False
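
# Sketch: is_ignored_mime("https://example.com/paper.PDF") is True
# (the comparison is case-insensitive); is_ignored_mime("https://example.com/")
# is False.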


def gen_headers():
    """Generate headers for network connection."""
    global MYHEADERS, MYPROXY

    MYHEADERS = {
        "Accept-Encoding": "gzip,deflate",
        "User-Agent": USER_AGENT,
        "Accept": "*/*",
        "Cookie": "",
        "DNT": "1",
    }

    MYPROXY = os.environ.get("https_proxy")
    if MYPROXY:
        try:
            url = parse_url(MYPROXY)
        except Exception as e:
            logger.error(e)
            return

        # Strip username and password (if present) and update headers
        if url.auth:
            MYPROXY = MYPROXY.replace(url.auth + "@", "")
            auth_headers = make_headers(basic_auth=url.auth)
            MYHEADERS.update(auth_headers)

        logger.debug("proxy: [%s]", MYPROXY)
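

# Minimal end-to-end sketch (not part of the original module): fetch one URL
# given on the command line and print the extracted fields. The argument
# handling below is illustrative only.
if __name__ == "__main__":
    import sys
    from logging import DEBUG, basicConfig

    basicConfig(level=DEBUG)
    target = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
    title, desc, keys, mime, bad = network_handler(target)
    print("title:", title)
    print("desc :", desc)
    print("keys :", keys)
    print("mime :", mime, "bad :", bad)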