platypush/platypush/backend/log/http.py

171 lines
5.4 KiB
Python

import datetime
import os
import re
from dataclasses import dataclass
from logging import getLogger
from threading import RLock
from typing import List, Optional, Iterable
from platypush.backend.file.monitor import FileMonitorBackend, EventHandler, MonitoredResource
from platypush.context import get_bus
from platypush.message.event.log.http import HttpLogEvent
logger = getLogger(__name__)
class LogEventHandler(EventHandler):
http_line_regex = re.compile(r'^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+\[([^]]+)]\s+"([^"]+)"\s+([\d]+)\s+'
r'([\d]+)\s*("([^"\s]+)")?\s*("([^"]+)")?$')
@dataclass
class FileResource:
path: str
pos: int = 0
lock: RLock = RLock()
last_timestamp: Optional[datetime.datetime] = None
def __init__(self, *args, monitored_files: Optional[Iterable[str]] = None, **kwargs):
super().__init__(*args, **kwargs)
self._monitored_files = {}
self.monitor_files(monitored_files or [])
def monitor_files(self, files: Iterable[str]):
self._monitored_files.update({
f: self.FileResource(path=f, pos=self._get_size(f))
for f in files
})
@staticmethod
def _get_size(file: str) -> int:
try:
return os.path.getsize(file)
except Exception as e:
logger.warning('Could not open {}: {}'.format(file, str(e)))
return 0
def on_created(self, event):
self._reset_file(event.src_path)
def on_deleted(self, event):
self._reset_file(event.src_path)
def on_moved(self, event):
self._reset_file(event.src_path)
def _reset_file(self, path: str):
file_info = self._monitored_files.get(path)
if not file_info:
return
file_info.pos = 0
def on_modified(self, event):
file_info = self._monitored_files.get(event.src_path)
if not file_info:
return
try:
file_size = os.path.getsize(event.src_path)
except OSError as e:
logger.warning('Could not get the size of {}: {}'.format(event.src_path, str(e)))
return
if file_info.pos > file_size:
logger.warning('The size of {} been unexpectedly decreased from {} to {} bytes'.format(
event.src_path, file_info.pos, file_size))
file_info.pos = 0
try:
with file_info.lock, open(event.src_path, 'r') as f:
f.seek(file_info.pos)
for line in f.readlines():
evt = self._build_event(file=event.src_path, line=line)
if evt and (not file_info.last_timestamp or evt.args['time'] >= file_info.last_timestamp):
get_bus().post(evt)
file_info.last_timestamp = evt.args['time']
file_info.pos = f.tell()
except OSError as e:
logger.warning('Error while reading from {}: {}'.format(self.resource.path, str(e)))
@classmethod
def _build_event(cls, file: str, line: str) -> Optional[HttpLogEvent]:
line = line.strip()
if not line:
return
m = cls.http_line_regex.match(line.strip())
if not m:
logger.warning('Could not parse log line from {}: {}'.format(file, line))
return
url = None
method = 'GET'
http_version = '1.0'
try:
url = m.group(5).split(' ')[1]
method = m.group(5).split(' ')[0]
http_version = m.group(5).split(' ')[2].split('/')[1]
except Exception as e:
logger.debug(str(e))
if not url:
return
info = {
'address': m.group(1),
'user_identifier': m.group(2),
'user_id': m.group(3),
'time': datetime.datetime.strptime(m.group(4), '%d/%b/%Y:%H:%M:%S %z'),
'method': method,
'url': url,
'http_version': http_version,
'status': int(m.group(6)),
'size': int(m.group(7)),
'referrer': m.group(9),
'user_agent': m.group(11),
}
for attr, value in info.items():
if value == '-':
info[attr] = None
return HttpLogEvent(logfile=file, **info)
class LogHttpBackend(FileMonitorBackend):
"""
This backend can be used to monitor one or more HTTP log files (tested on Apache and Nginx) and trigger events
whenever a new log line is added.
Triggers:
* :class:`platypush.message.event.log.http.HttpLogEvent` when a new log line is created.
Requires:
* **watchdog** (``pip install watchdog``)
"""
class EventHandlerFactory:
@staticmethod
def from_resource(resource: str) -> LogEventHandler:
resource = MonitoredResource(resource)
return LogEventHandler.from_resource(resource)
def __init__(self, log_files: List[str], **kwargs):
"""
:param log_files: List of log files to be monitored.
"""
self.log_files = {os.path.expanduser(log) for log in log_files}
directories = {os.path.dirname(log) for log in self.log_files}
super().__init__(paths=directories, **kwargs)
# noinspection PyProtectedMember
handlers = self._observer._handlers
for hndls in handlers.values():
for hndl in hndls:
hndl.monitor_files(self.log_files)