gettext-repubblica/get_rep_text.py
2018-07-02 18:55:05 +02:00

22 lines
629 B
Python
Executable file

#!/usr/bin/env python
#from: https://paste.debian.net/hidden/55e325f5/
import lxml.html
import requests
import re
import sys
import subprocess
import tempfile
def get_rep_text(url, timeout=30):
    """Return the text found inside the ``paywall`` divs of an article page.

    Every ``/pwa/`` segment of *url* is rewritten to ``/ws/detail/``, the
    resulting address is fetched, and the text content of each
    ``<div class="paywall">`` in the response is concatenated.

    Args:
        url: Address of the article; ``/pwa/`` is replaced wherever it occurs.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The joined text of all matching divs, or ``''`` when none match.

    Raises:
        requests.exceptions.RequestException: On connection failure, on
            timeout, or when the server answers with an HTTP error status.
    """
    text_url = re.sub(r'/pwa/', '/ws/detail/', url)
    # Without an explicit timeout requests waits forever on a stalled server.
    page = requests.get(text_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into ''.
    page.raise_for_status()
    root = lxml.html.fromstring(page.text)
    return ''.join(
        div.text_content() for div in root.xpath('//div[@class="paywall"]'))
def main(argv=None):
    """Fetch the article named on the command line and open it in Firefox.

    Args:
        argv: Full argument vector (program name first). Defaults to
            ``sys.argv``. Arguments after the URL are ignored.

    Returns:
        2 when no URL was supplied, otherwise the exit status returned by
        the ``firefox`` command.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'get_rep_text.py'
        sys.stderr.write('usage: %s URL\n' % prog)
        return 2

    text = get_rep_text(argv[1])
    # delete=False: the browser needs the file to outlive this process, so it
    # is intentionally left behind in the temp directory.
    # The .txt suffix lets the browser identify the content as plain text.
    with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as tmpf:
        tmpf.write(text.encode('utf-8'))
    # Launch only after the with-block has flushed and closed the file, so
    # the browser never reads a partially written document.
    return subprocess.call(['firefox', tmpf.name])


if __name__ == '__main__':
    sys.exit(main())