123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import os
- import re
- import io
- import json
- import shutil
- import urllib
- import smtplib
- from email.mime.text import MIMEText
- import logging
- import datetime
- import requests
- import subprocess
- import multiprocessing
- from lxml import etree
- from xml.etree import ElementTree
- from tornado.ioloop import IOLoop
- from apscheduler.triggers.cron import CronTrigger
- from apscheduler.schedulers.tornado import TornadoScheduler
- from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
- import tornado.httpserver
- import tornado.ioloop
- import tornado.options
- from tornado.options import define, options
- import tornado.web
- from tornado import gen, escape
# Version advertised in the versioned API routes.
API_VERSION = '1.0'

# Persistent data: the scheduler job store, the schedules and the default configuration.
JOBS_STORE = 'sqlite:///conf/jobs.db'
SCHEDULES_FILE = 'conf/schedules.json'
DEFAULT_CONF = 'conf/diffido.conf'

# Default sender address of the emails.
EMAIL_FROM = 'diffido@localhost'

# Command used to run git.
GIT_CMD = 'git'

# Extract the counters from a "N insertion(s)... M deletion(s)..." summary line.
re_insertion = re.compile(r'(\d+) insertion')
re_deletion = re.compile(r'(\d+) deletion')

# The root logger is used, with INFO as the initial level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def read_schedules():
    """Load the schedules stored in SCHEDULES_FILE.

    The returned structure always contains a 'schedules' dictionary, so that
    callers can index it without further checks.

    :returns: the stored data; an empty structure if the file is missing,
        unreadable or does not contain a JSON object
    :rtype: dict"""
    if not os.path.isfile(SCHEDULES_FILE):
        return {'schedules': {}}
    try:
        with open(SCHEDULES_FILE, 'r') as fd:
            data = json.load(fd)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        logger.error('unable to read %s: %s', SCHEDULES_FILE, e)
        return {'schedules': {}}
    if not isinstance(data, dict):
        logger.error('unable to read %s: %s', SCHEDULES_FILE, 'the content is not a JSON object')
        return {'schedules': {}}
    if not isinstance(data.get('schedules'), dict):
        # e.g. a hand-edited "{}" file: callers do schedules['schedules'][id_] = ...
        data['schedules'] = {}
    return data
def write_schedules(schedules):
    """Store the schedules in SCHEDULES_FILE, as indented JSON.

    :param schedules: the complete schedules structure
    :type schedules: dict
    :raises TypeError: if the data cannot be serialized to JSON"""
    # Serialize before opening the file: mode 'w' truncates it, so an error
    # raised while serializing would otherwise wipe the stored schedules.
    serialized = json.dumps(schedules, indent=2)
    with open(SCHEDULES_FILE, 'w') as fd:
        fd.write(serialized)
def next_id(schedules):
    """Return the ID to assign to a new schedule.

    :param schedules: the complete schedules structure
    :type schedules: dict
    :returns: '1' when no schedule exists, otherwise the highest numeric ID plus one
    :rtype: str"""
    stored = schedules.get('schedules', {})
    numbers = [int(key) for key in stored.keys()]
    return str(max(numbers) + 1) if numbers else '1'
def get_schedule(id_, add_id=True):
    """Return the stored settings of a schedule.

    :param id_: ID of the schedule
    :type id_: str
    :param add_id: if True, add the ID to the returned data as the 'id' key
    :type add_id: bool
    :returns: the schedule; an empty dictionary (plus 'id', when requested)
        if the schedule is not stored
    :rtype: dict"""
    stored = read_schedules().get('schedules', {})
    schedule = stored.get(id_, {})
    if add_id:
        schedule['id'] = str(id_)
    return schedule
def select_xpath(content, xpath):
    """Reduce a document to the portions selected by an XPath expression.

    :param content: the document to parse
    :type content: str
    :param xpath: the XPath expression to apply
    :type xpath: str
    :returns: the concatenation of the inner content (text plus serialized
        children) of every selected element, or of the selected strings;
        the whole content if nothing is selected
    :rtype: str"""
    tree = etree.parse(io.StringIO(content))
    elems = tree.xpath(xpath)
    if not elems:
        return content
    selected_content = []
    for elem in elems:
        if isinstance(elem, str):
            # text() and attribute selections return strings, which have no children
            selected_content.append(elem)
            continue
        # elem.text is None when the element does not start with text
        parts = [elem.text or '']
        parts.extend(ElementTree.tostring(child).decode('utf-8', 'replace') for child in elem)
        selected_content.append(''.join(parts))
    return ''.join(selected_content)
def run_job(id_=None, *args, **kwargs):
    """Fetch the page of a schedule, commit it to its git repository and notify the changes.

    Nothing is done if the schedule has no 'url'.  The page is always committed;
    the notification is sent only if the schedule has an 'email', the commit
    changed some lines and, when 'minimum_change' is set, the fraction of
    changed lines (relative to the previous length of the file) reaches it.

    :param id_: ID of the schedule to run
    :type id_: str"""

    def _commit(id_, filename, content):
        """Write content to filename in the repository of the schedule, commit it
        and return the statistics of the commit."""
        repo_dir = 'storage/%s' % id_
        file_path = os.path.join(repo_dir, filename)
        current_lines = 0
        if os.path.isfile(file_path):
            with open(file_path, 'r', errors='replace') as fd:
                current_lines = sum(1 for _ in fd)
        with open(file_path, 'w') as fd:
            fd.write(content)
        # git runs with cwd= instead of a chdir() in a forked process: an error
        # now propagates to the caller instead of leaving it waiting forever.
        # '--' prevents a file name starting with '-' to be parsed as an option.
        added = subprocess.run([GIT_CMD, 'add', '--', filename], cwd=repo_dir)
        if added.returncode:
            logger.error('git add failed for job %s (exit code %s)', id_, added.returncode)
        committed = subprocess.run([GIT_CMD, 'commit', '-m', '%s' % datetime.datetime.utcnow(),
                                    '--allow-empty'],
                                   cwd=repo_dir, stdout=subprocess.PIPE)
        if committed.returncode:
            logger.error('git commit failed for job %s (exit code %s)', id_, committed.returncode)
        stdout = committed.stdout.decode('utf-8', 'replace')
        insert = re_insertion.findall(stdout)
        insert = int(insert[0]) if insert else 0
        delete = re_deletion.findall(stdout)
        delete = int(delete[0]) if delete else 0
        return {'insertions': insert, 'deletions': delete, 'previous_lines': current_lines,
                'changes': max(insert, delete)}

    schedule = get_schedule(id_, add_id=False)
    url = schedule.get('url')
    if not url:
        return
    logger.debug('Running job id:%s title:%s url: %s', id_, schedule.get('title', ''), url)
    req = requests.get(url, allow_redirects=True, timeout=(30.10, 240))
    content = req.text
    xpath = schedule.get('xpath')
    if xpath:
        content = select_xpath(content, xpath)
    # the file is named after the last component of the final (redirected) URL
    req_path = urllib.parse.urlparse(req.url).path
    base_name = os.path.basename(req_path) or 'index.html'
    res = _commit(id_, base_name, content)

    email = schedule.get('email')
    if not email:
        return
    changes = res.get('changes')
    if not changes:
        return
    min_change = schedule.get('minimum_change')
    previous_lines = res.get('previous_lines')
    if min_change and previous_lines:
        min_change = float(min_change)
        change_fraction = changes / previous_lines
        if change_fraction < min_change:
            return
    # send notification, with the diff introduced by the commit just created
    diff = get_diff(id_).get('diff')
    if not diff:
        return
    send_email(to=email, subject='%s page changed' % schedule.get('title'),
               body='changes:\n\n%s' % diff)
def safe_run_job(id_=None, *args, **kwargs):
    """Run a job, catching every exception it raises.

    A failure is logged and, if the schedule has an 'email', notified to
    that address.  A failure of the notification itself is only logged.

    :param id_: ID of the schedule to run
    :type id_: str"""
    try:
        run_job(id_, *args, **kwargs)
    except Exception as e:
        message = 'error executing job %s: %s' % (id_, e)
        logger.exception(message)
        try:
            recipient = get_schedule(id_, add_id=False).get('email')
            if recipient:
                # the message is the body of the email, not its recipient
                send_email(to=recipient, subject='diffido: job %s failed' % id_, body=message)
        except Exception:
            logger.exception('unable to notify the failure of job %s', id_)
def send_email(to, subject='diffido', body='', from_=None):
    """Send a plain text email through the SMTP server on localhost.

    :param to: address of the recipient
    :type to: str
    :param subject: subject of the email
    :type subject: str
    :param body: text of the email
    :type body: str
    :param from_: address of the sender; EMAIL_FROM if not specified
    :type from_: str"""
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_ or EMAIL_FROM
    msg['To'] = to
    # the context manager closes the connection even if the delivery fails
    with smtplib.SMTP('localhost') as smtp:
        smtp.send_message(msg)
def get_history(id_):
    """Return the list of commits of the repository of a schedule.

    :param id_: ID of the schedule
    :type id_: str
    :returns: a dictionary with 'history' (the commits in "git log" order,
        each with 'id', 'message', 'insertions', 'deletions', 'changes' and
        a 1-based 'seq') and 'lastid' (ID of the first listed commit, or None);
        the history is empty if the repository does not exist
    :rtype: dict"""
    repo_dir = 'storage/%s' % id_
    history = []
    if os.path.isdir(repo_dir):
        # cwd= avoids changing the working directory of the whole process
        proc = subprocess.run([GIT_CMD, 'log', '--pretty=oneline', '--shortstat'],
                              cwd=repo_dir, stdout=subprocess.PIPE)
        for line in proc.stdout.decode('utf-8', 'replace').split('\n'):
            line = line.strip()
            if not line:
                continue
            commit_id, _, message = line.partition(' ')
            if re.fullmatch(r'[0-9a-f]{40}', commit_id):
                history.append({'id': commit_id, 'message': message, 'insertions': 0, 'deletions': 0,
                                'changes': 0})
                continue
            if not history:
                continue
            # Anything else is the summary of the last commit seen.  Commits
            # without changes (see --allow-empty in run_job) have no summary,
            # so it cannot be assumed that one follows every commit line.
            insert = re_insertion.findall(line)
            delete = re_deletion.findall(line)
            if insert or delete:
                item = history[-1]
                item['insertions'] = int(insert[0]) if insert else 0
                item['deletions'] = int(delete[0]) if delete else 0
                item['changes'] = max(item['insertions'], item['deletions'])
    for idx, item in enumerate(history):
        item['seq'] = idx + 1
    lastid = history[0]['id'] if history else None
    return {'history': history, 'lastid': lastid}
def get_diff(id_, commit_id='HEAD', old_commit_id=None):
    """Return the diff between two commits of the repository of a schedule.

    :param id_: ID of the schedule
    :type id_: str
    :param commit_id: the most recent commit to compare; 'HEAD' if empty
    :type commit_id: str
    :param old_commit_id: the oldest commit to compare; if not specified,
        the parent of commit_id
    :type old_commit_id: str
    :returns: a dictionary with the output of "git diff" in the 'diff' key;
        the diff is empty if the repository does not exist
    :rtype: dict"""
    # tolerate empty values: they would otherwise be passed verbatim to git
    commit_id = commit_id or 'HEAD'
    logger.debug('diff of schedule %s between %s and %s', id_, old_commit_id or '%s~' % commit_id,
                 commit_id)
    repo_dir = 'storage/%s' % id_
    if not os.path.isdir(repo_dir):
        return {'diff': ''}
    # cwd= avoids changing the working directory of the whole process
    proc = subprocess.run([GIT_CMD, 'diff', old_commit_id or '%s~' % commit_id, commit_id],
                          cwd=repo_dir, stdout=subprocess.PIPE)
    # the stored pages are not guaranteed to be valid UTF-8
    return {'diff': proc.stdout.decode('utf-8', 'replace')}
def scheduler_update(scheduler, id_):
    """Create or replace the job of a schedule, creating its repository if needed.

    Nothing is done if the schedule is not stored or its 'trigger' is neither
    'interval' nor 'cron'.

    :param scheduler: the scheduler the job is added to
    :param id_: ID of the schedule
    :type id_: str"""
    schedule = get_schedule(id_, add_id=False)
    if not schedule:
        return
    trigger = schedule.get('trigger')
    if trigger == 'interval':
        job_args = {'trigger': 'interval'}
        for unit in ('weeks', 'days', 'hours', 'minutes', 'seconds'):
            key = 'interval_%s' % unit
            if key in schedule:
                job_args[unit] = int(schedule[key])
    elif trigger == 'cron':
        job_args = {'trigger': CronTrigger.from_crontab(schedule.get('cron_crontab'))}
    else:
        return
    git_create_repo(id_)
    scheduler.add_job(safe_run_job, id=id_, replace_existing=True, kwargs={'id_': id_}, **job_args)
def scheduler_delete(scheduler, id_):
    """Remove the job of a schedule and delete its repository.

    :param scheduler: the scheduler the job is removed from
    :param id_: ID of the schedule
    :type id_: str"""
    from apscheduler.jobstores.base import JobLookupError
    try:
        scheduler.remove_job(job_id=id_)
    except JobLookupError:
        # A schedule without a valid trigger never had a job (see
        # scheduler_update); its repository must be removed anyway.
        logger.debug('no job to remove for schedule %s', id_)
    git_delete_repo(id_)
def reset_from_schedules(scheduler):
    """Remove every job from the scheduler and add them again from the stored schedules.

    :param scheduler: the scheduler to reset"""
    scheduler.remove_all_jobs()
    stored = read_schedules().get('schedules', {})
    for schedule_id in stored.keys():
        scheduler_update(scheduler, id_=schedule_id)
def git_create_repo(id_):
    """Initialize the git repository of a schedule, unless its directory already exists.

    :param id_: ID of the schedule
    :type id_: str"""
    repo_dir = 'storage/%s' % id_
    if not os.path.isdir(repo_dir):
        # wait for "git init" to complete; its exit code is not checked
        subprocess.call([GIT_CMD, 'init', repo_dir])
def git_delete_repo(id_):
    """Delete the directory of the git repository of a schedule, if it exists.

    :param id_: ID of the schedule
    :type id_: str"""
    repo_dir = 'storage/%s' % id_
    if os.path.isdir(repo_dir):
        shutil.rmtree(repo_dir)
class DiffidoBaseException(Exception):
    """Root of the exceptions raised by diffido.

    :param message: text message
    :type message: str
    :param status: numeric http status code
    :type status: int"""

    def __init__(self, message, status=400):
        Exception.__init__(self, message)
        # both values are read by the error handler of the request handlers
        self.status = status
        self.message = message
class BaseHandler(tornado.web.RequestHandler):
    """Base class for request handlers."""

    # NOTE(review): the three attributes below are not used by any handler of
    # this module; they are kept only to leave the interface of the class unchanged.
    _users_cache = {}
    document = None
    collection = None

    @property
    def arguments(self):
        """The request arguments, reduced to the first value of each one.

        :returns: argument names mapped to their first value, decoded as UTF-8
        :rtype: dict"""
        return {key: values[0].decode('utf-8') for key, values in self.request.arguments.items()}

    @property
    def clean_body(self):
        """Return the decoded JSON body of the request.

        :returns: the decoded body; an empty dictionary if the body is empty
        :rtype: dict
        :raises DiffidoBaseException: with status 400, if the body is not valid JSON"""
        try:
            return escape.json_decode(self.request.body or '{}')
        except ValueError as e:
            # a malformed body is an error of the client, not an internal one
            raise DiffidoBaseException('invalid JSON body: %s' % e, status=400) from e

    def write_error(self, status_code, **kwargs):
        """Default error handler.

        A DiffidoBaseException provides the status code and the message of the
        response; any other error is reported as a generic 'internal error'."""
        exc = kwargs.get('exc_info', (None, None))[1]
        if isinstance(exc, DiffidoBaseException):
            status_code = exc.status
            message = exc.message
        else:
            message = 'internal error'
        self.build_error(message, status=status_code)

    def is_api(self):
        """Return True if the path is from an API call."""
        # every API route, versioned or not, is registered under /api/
        return self.request.path.startswith('/api/')

    def initialize(self, **kwargs):
        """Add every passed (key, value) as attributes of the instance."""
        for key, value in kwargs.items():
            setattr(self, key, value)

    def build_error(self, message='', status=400):
        """Build and write an error message.

        :param message: textual message
        :type message: str
        :param status: HTTP status code
        :type status: int
        """
        self.set_status(status)
        self.write({'error': True, 'message': message})

    def build_success(self, message='', status=200):
        """Build and write a success message.

        :param message: textual message
        :type message: str
        :param status: HTTP status code
        :type status: int
        """
        self.set_status(status)
        self.write({'error': False, 'message': message})
class SchedulesHandler(BaseHandler):
    """Read, create, update and delete the schedules."""

    @gen.coroutine
    def get(self, id_=None, *args, **kwargs):
        """Write a single schedule if an ID is given, otherwise all the schedules."""
        if id_ is None:
            self.write(read_schedules())
            return
        self.write({'schedule': get_schedule(id_)})

    @gen.coroutine
    def put(self, id_=None, *args, **kwargs):
        """Replace an existing schedule with the body of the request and update its job."""
        if id_ is None:
            self.build_error(message='update action requires an ID')
            return
        payload = self.clean_body
        stored = read_schedules()
        if id_ not in stored.get('schedules', {}):
            self.build_error(message='schedule %s not found' % id_)
            return
        stored['schedules'][id_] = payload
        write_schedules(stored)
        scheduler_update(scheduler=self.scheduler, id_=id_)
        self.write(get_schedule(id_=id_))

    @gen.coroutine
    def post(self, *args, **kwargs):
        """Store the body of the request as a new schedule and create its job."""
        payload = self.clean_body
        stored = read_schedules()
        new_id = next_id(stored)
        stored['schedules'][new_id] = payload
        write_schedules(stored)
        scheduler_update(scheduler=self.scheduler, id_=new_id)
        self.write(get_schedule(id_=new_id))

    @gen.coroutine
    def delete(self, id_=None, *args, **kwargs):
        """Remove a schedule, its job and its repository."""
        if id_ is None:
            self.build_error(message='an ID must be specified')
            return
        stored = read_schedules()
        if id_ in stored.get('schedules', {}):
            stored['schedules'].pop(id_)
            write_schedules(stored)
        scheduler_delete(scheduler=self.scheduler, id_=id_)
        self.build_success(message='removed schedule %s' % id_)
class ResetSchedulesHandler(BaseHandler):
    """Rebuild the jobs of the scheduler from the stored schedules."""

    @gen.coroutine
    def post(self, *args, **kwargs):
        """Reset the scheduler; nothing is written to the response."""
        scheduler = self.scheduler
        reset_from_schedules(scheduler)
class HistoryHandler(BaseHandler):
    """Expose the history of the commits of a schedule."""

    @gen.coroutine
    def get(self, id_, *args, **kwargs):
        """Write the history of the schedule with the given ID."""
        history = get_history(id_)
        self.write(history)
class DiffHandler(BaseHandler):
    """Expose the diff between two commits of a schedule."""

    @gen.coroutine
    def get(self, id_, commit_id, old_commit_id=None, *args, **kwargs):
        """Write the diff between old_commit_id (if given) and commit_id."""
        diff = get_diff(id_, commit_id, old_commit_id)
        self.write(diff)
class TemplateHandler(BaseHandler):
    """Handler for the / path."""
    app_path = os.path.join(os.path.dirname(__file__), "dist")

    @gen.coroutine
    def get(self, *args, **kwargs):
        """Render the requested page, 'index.html' by default.

        The request arguments are passed to the template as variables.

        :raises tornado.web.HTTPError: 404 if the page is not a file inside
            the templates directory"""
        page = 'index.html'
        if args and args[0]:
            # a path made only of slashes falls back to the default page
            page = args[0].strip('/') or page
        # The page comes from the URL: refuse anything that, once normalized,
        # points outside the templates directory (e.g. using '..' components).
        root = os.path.abspath(self.get_template_path() or self.app_path)
        target = os.path.abspath(os.path.join(root, page))
        if not target.startswith(root.rstrip(os.sep) + os.sep) or not os.path.isfile(target):
            raise tornado.web.HTTPError(404)
        arguments = self.arguments
        self.render(page, **arguments)
def serve():
    """Start the scheduler, read the configuration and run the web server.

    The function returns when the I/O loop is stopped or interrupted."""
    global EMAIL_FROM

    scheduler = TornadoScheduler(jobstores={'default': SQLAlchemyJobStore(url=JOBS_STORE)})
    scheduler.start()

    base_dir = os.path.dirname(__file__)
    define('port', default=3210, help='run on the given port', type=int)
    define('address', default='', help='bind the server at the given address', type=str)
    define('ssl_cert', default=os.path.join(base_dir, 'ssl', 'diffido_cert.pem'),
           help='specify the SSL certificate to use for secure connections')
    define('ssl_key', default=os.path.join(base_dir, 'ssl', 'diffido_key.pem'),
           help='specify the SSL private key to use for secure connections')
    define('admin-email', default='', help='email address of the site administrator', type=str)
    define('debug', default=False, help='run in debug mode')
    define('config', help='read configuration file',
           callback=lambda path: tornado.options.parse_config_file(path, final=False))
    # the default configuration file is read before the command line is parsed
    if not options.config and os.path.isfile(DEFAULT_CONF):
        tornado.options.parse_config_file(DEFAULT_CONF, final=False)
    tornado.options.parse_command_line()

    if options.admin_email:
        EMAIL_FROM = options.admin_email
    if options.debug:
        logger.setLevel(logging.DEBUG)

    # SSL is enabled only if both the key and the certificate exist
    ssl_options = {}
    if os.path.isfile(options.ssl_key) and os.path.isfile(options.ssl_cert):
        ssl_options = dict(certfile=options.ssl_cert, keyfile=options.ssl_key)

    # values set as attributes of every handler (see BaseHandler.initialize)
    init_params = dict(listen_port=options.port, logger=logger, ssl_options=ssl_options,
                       scheduler=scheduler)

    # every API path is registered both without and with the version prefix;
    # the catch-all route of the templates must stay the last one
    api_routes = (
        (r'schedules/reset', ResetSchedulesHandler),
        (r'schedules/?(?P<id_>\d+)?', SchedulesHandler),
        (r'history/?(?P<id_>\d+)', HistoryHandler),
        (r'diff/(?P<id_>\d+)/(?P<commit_id>[0-9a-f]+)/?(?P<old_commit_id>[0-9a-f]+)?/?', DiffHandler),
    )
    routes = []
    for path, handler in api_routes:
        routes.append((r'/api/%s' % path, handler, init_params))
        routes.append((r'/api/v%s/%s' % (API_VERSION, path), handler, init_params))
    routes.append((r'/?(.*)', TemplateHandler, init_params))

    application = tornado.web.Application(
        routes,
        static_path=os.path.join(base_dir, 'dist/static'),
        template_path=os.path.join(base_dir, 'dist/'),
        debug=options.debug)
    http_server = tornado.httpserver.HTTPServer(application, ssl_options=ssl_options or None)
    logger.info('Start serving on %s://%s:%d', 'https' if ssl_options else 'http',
                options.address or '127.0.0.1',
                options.port)
    http_server.listen(options.port, options.address)
    try:
        IOLoop.instance().start()
    except (KeyboardInterrupt, SystemExit):
        pass


if __name__ == '__main__':
    serve()
|