platypush/platypush/plugins/log/http/__init__.py

184 lines
5.5 KiB
Python

import datetime
import os
import re
from dataclasses import dataclass
from logging import getLogger
from threading import RLock
from typing import Optional, Iterable
from platypush.plugins.file.monitor import (
FileMonitorPlugin,
EventHandler,
MonitoredResource,
)
from platypush.context import get_bus
from platypush.message.event.log.http import HttpLogEvent
logger = getLogger(__name__)
class LogEventHandler(EventHandler):
http_line_regex = re.compile(
r'^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+\[([^]]+)]\s+"([^"]+)"\s+([\d]+)\s+'
r'([\d]+)\s*("([^"\s]+)")?\s*("([^"]+)")?$'
)
@dataclass
class FileResource:
path: str
pos: int = 0
lock: RLock = RLock()
last_timestamp: Optional[datetime.datetime] = None
def __init__(
self, *args, monitored_files: Optional[Iterable[str]] = None, **kwargs
):
super().__init__(*args, **kwargs)
self._monitored_files = {}
self.monitor_files(monitored_files or [])
def monitor_files(self, files: Iterable[str]):
self._monitored_files.update(
{f: self.FileResource(path=f, pos=self._get_size(f)) for f in files}
)
@staticmethod
def _get_size(file: str) -> int:
try:
return os.path.getsize(file)
except Exception as e:
logger.warning('Could not open {}: {}'.format(file, str(e)))
return 0
def on_created(self, event):
self._reset_file(event.src_path)
def on_deleted(self, event):
self._reset_file(event.src_path)
def on_moved(self, event):
self._reset_file(event.src_path)
def _reset_file(self, path: str):
file_info = self._monitored_files.get(path)
if not file_info:
return
file_info.pos = 0
def on_modified(self, event):
file_info = self._monitored_files.get(event.src_path)
if not file_info:
return
try:
file_size = os.path.getsize(event.src_path)
except OSError as e:
logger.warning(
'Could not get the size of {}: {}'.format(event.src_path, str(e))
)
return
if file_info.pos > file_size:
logger.warning(
'The size of {} been unexpectedly decreased from {} to {} bytes'.format(
event.src_path, file_info.pos, file_size
)
)
file_info.pos = 0
try:
with file_info.lock, open(event.src_path, 'r') as f:
f.seek(file_info.pos)
for line in f.readlines():
evt = self._build_event(file=event.src_path, line=line)
if evt and (
not file_info.last_timestamp
or evt.args['time'] >= file_info.last_timestamp
):
get_bus().post(evt)
file_info.last_timestamp = evt.args['time']
file_info.pos = f.tell()
except OSError as e:
logger.warning(
'Error while reading from {}: {}'.format(self.resource.path, str(e))
)
@classmethod
def _build_event(cls, file: str, line: str) -> Optional[HttpLogEvent]:
line = line.strip()
if not line:
return
m = cls.http_line_regex.match(line.strip())
if not m:
logger.warning('Could not parse log line from {}: {}'.format(file, line))
return
url = None
method = 'GET'
http_version = '1.0'
try:
url = m.group(5).split(' ')[1]
method = m.group(5).split(' ')[0]
http_version = m.group(5).split(' ')[2].split('/')[1]
except Exception as e:
logger.debug(str(e))
if not url:
return
info = {
'address': m.group(1),
'user_identifier': m.group(2),
'user_id': m.group(3),
'time': datetime.datetime.strptime(m.group(4), '%d/%b/%Y:%H:%M:%S %z'),
'method': method,
'url': url,
'http_version': http_version,
'status': int(m.group(6)),
'size': int(m.group(7)),
'referrer': m.group(9),
'user_agent': m.group(11),
}
for attr, value in info.items():
if value == '-':
info[attr] = None
return HttpLogEvent(logfile=file, **info)
class LogHttpPlugin(FileMonitorPlugin):
"""
This plugin can be used to monitor one or more HTTP log files (tested on
Apache and Nginx) and trigger events whenever a new log line is added.
"""
def __init__(
self, paths: Iterable[str], log_files: Optional[Iterable[str]] = None, **kwargs
):
"""
:param paths: List of log files to be monitored.
"""
if log_files:
self.logger.warning(
'The log_files parameter is deprecated, use paths instead'
)
paths = {os.path.expanduser(log) for log in {*paths, *(log_files or [])}}
directories = {os.path.dirname(log) for log in paths}
super().__init__(paths=directories, **kwargs)
handlers = self._observer._handlers
for hndls in handlers.values():
for hndl in hndls:
hndl.monitor_files(paths)
@staticmethod
def event_handler_from_resource(resource: str) -> LogEventHandler:
return LogEventHandler.from_resource(MonitoredResource(resource))