import datetime
import os
import re
import time

from sqlalchemy import (
    create_engine,
    Column,
    Integer,
    String,
    DateTime,
    PrimaryKeyConstraint,
    ForeignKey,
)

from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.sql.expression import func

from platypush.common.db import declarative_base
from platypush.config import Config
from platypush.plugins.media import MediaPlugin
from platypush.plugins.media.search import MediaSearcher

Base = declarative_base()
Session = scoped_session(sessionmaker())


class LocalMediaSearcher(MediaSearcher):
    """
    This class searches for media in the locally configured directories. It
    indexes the media files for faster searches, detects which directories
    have changed since the last scan, and re-indexes their content if needed.
    """
    _filename_separators = r'[.,_\-@()\[\]\{\}\s\'\"]+'

    def __init__(self, dirs, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dirs = dirs
        db_dir = os.path.join(Config.get('workdir'), 'media')
        os.makedirs(db_dir, exist_ok=True)
        self.db_file = os.path.join(db_dir, 'media.db')
        self._db_engine = None

    def _get_db_session(self):
        if not self._db_engine:
            self._db_engine = create_engine(
                'sqlite:///{}'.format(self.db_file),
                connect_args={'check_same_thread': False},
            )

            Base.metadata.create_all(self._db_engine)
            Session.configure(bind=self._db_engine)

        return Session()

    @staticmethod
    def _get_or_create_dir_entry(session, path):
        record = session.query(MediaDirectory).filter_by(path=path).first()
        if record is None:
            record = MediaDirectory.build(path=path)
            session.add(record)

        session.commit()
        return record

    @classmethod
    def _get_last_modify_time(cls, path, recursive=False):
        return (
            max([os.path.getmtime(p) for p, _, _ in os.walk(path)])
            if recursive
            else os.path.getmtime(path)
        )

    @classmethod
    def _has_directory_changed_since_last_indexing(cls, dir_record):
        if not dir_record.last_indexed_at:
            return True

        return (
            datetime.datetime.fromtimestamp(cls._get_last_modify_time(dir_record.path))
            > dir_record.last_indexed_at
        )

    @classmethod
    def _matches_query(cls, filename, query):
        filename = filename.lower()
        query_tokens = [
            _.lower() for _ in re.split(cls._filename_separators, query.strip())
        ]

        return all(token in filename for token in query_tokens)

    @classmethod
    def _sync_token_records(cls, session, *tokens):
        tokens = list(tokens)
        if not tokens:
            return []

        records = {
            record.token: record
            for record in session.query(MediaToken)
            .filter(MediaToken.token.in_(tokens))
            .all()
        }

        for token in tokens:
            if token in records:
                continue
            record = MediaToken.build(token=token)
            session.add(record)
            records[token] = record

        session.commit()
        return session.query(MediaToken).filter(MediaToken.token.in_(tokens)).all()

    @classmethod
    def _get_file_records(cls, dir_record, session):
        return session.query(MediaFile).filter_by(directory_id=dir_record.id).all()

    def scan(self, media_dir, session=None, dir_record=None):
        """
        Scans a media directory and stores the indexed files in the internal
        SQLite index.
        """

        if not session:
            session = self._get_db_session()

        self.logger.info('Indexing directory {}'.format(media_dir))
        index_start_time = time.time()

        if not dir_record:
            dir_record = self._get_or_create_dir_entry(session, media_dir)

        if not os.path.isdir(media_dir):
            self.logger.info(
                'Directory {} is no longer accessible, removing it'.format(media_dir)
            )
            session.query(MediaDirectory).filter(
                MediaDirectory.path == media_dir
            ).delete(synchronize_session='fetch')
            return

        stored_file_records = {
            f.path: f for f in self._get_file_records(dir_record, session)
        }

        for path, _, files in os.walk(media_dir):
            for filename in files:
                filepath = os.path.join(path, filename)

                if filepath in stored_file_records:
                    # stored_file_records will be used to keep track of the
                    # files that have been removed from media_dir. If the file
                    # is still there, don't index it again
                    del stored_file_records[filepath]
                    continue

                if not MediaPlugin.is_video_file(
                    filename
                ) and not MediaPlugin.is_audio_file(filename):
                    continue

                self.logger.debug('Syncing item {}'.format(filepath))
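                # The filename is tokenized on _filename_separators; for
                # example, '2001.A.Space.Odyssey.1968.mkv' yields the tokens
                # ['2001', 'a', 'space', 'odyssey', '1968', 'mkv']. Each token
                # is stored in MediaToken and linked to the file through
                # MediaFileToken.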
                tokens = [
                    _.lower()
                    for _ in re.split(self._filename_separators, filename.strip())
                ]

                token_records = self._sync_token_records(session, *tokens)
                file_record = MediaFile.build(directory_id=dir_record.id, path=filepath)

                session.add(file_record)
                session.commit()
                file_record = (
                    session.query(MediaFile)
                    .filter_by(directory_id=dir_record.id, path=filepath)
                    .one()
                )

                for token_record in token_records:
                    file_token = MediaFileToken.build(
                        file_id=file_record.id, token_id=token_record.id
                    )
                    session.add(file_token)

        # stored_file_records should now only contain the records of the files
        # that have been removed from the directory
        if stored_file_records:
            self.logger.info(
                'Removing references to {} deleted media items from {}'.format(
                    len(stored_file_records), media_dir
                )
            )

            session.query(MediaFile).filter(
                MediaFile.id.in_([record.id for record in stored_file_records.values()])
            ).delete(synchronize_session='fetch')

        dir_record.last_indexed_at = datetime.datetime.now()
        self.logger.info(
            'Scanned {} in {} seconds'.format(
                media_dir, int(time.time() - index_start_time)
            )
        )

        session.commit()

    def search(self, query, **kwargs):
        """
        Searches in the configured media directories given a query. It uses the
        built-in SQLite index if available. If any directory has changed since
        the last scan then it will be indexed again and the up-to-date results
        will be returned.
        """
        session = self._get_db_session()
        results = {}

        for media_dir in self.dirs:
            self.logger.info('Searching {} for "{}"'.format(media_dir, query))
            dir_record = self._get_or_create_dir_entry(session, media_dir)

            if self._has_directory_changed_since_last_indexing(dir_record):
                self.logger.info(
                    '{} has changed since last indexing, '.format(media_dir)
                    + 're-indexing'
                )

                self.scan(media_dir, session=session, dir_record=dir_record)

            query_tokens = [
                _.lower() for _ in re.split(self._filename_separators, query.strip())
            ]
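            # A file is returned only if it matches every query token: the
            # join against MediaFileToken/MediaToken is filtered to the query
            # tokens, grouped by file path, and kept only when the number of
            # matched tokens covers the whole query.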
            for file_record in (
                session.query(MediaFile.path)
                .join(MediaFileToken)
                .join(MediaToken)
                .filter(MediaToken.token.in_(query_tokens))
                .group_by(MediaFile.path)
                .having(func.count(MediaFileToken.token_id) >= len(query_tokens))
            ):
                if os.path.isfile(file_record.path):
                    results[file_record.path] = {
                        'url': 'file://' + file_record.path,
                        'title': os.path.basename(file_record.path),
                        'size': os.path.getsize(file_record.path),
                    }

        return results.values()


# --- Table definitions
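#
# Schema overview: one MediaDirectory has many MediaFile rows, and MediaFile
# and MediaToken are in a many-to-many relationship through the MediaFileToken
# association table. The build() classmethods construct detached records that
# are added to the session by the searcher.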
class MediaDirectory(Base):
    """Models the MediaDirectory table"""

    __tablename__ = 'MediaDirectory'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)
    path = Column(String)
    last_indexed_at = Column(DateTime)

    @classmethod
    def build(cls, path, last_indexed_at=None, id=None):
        record = cls()
        record.id = id
        record.path = path
        record.last_indexed_at = last_indexed_at
        return record


class MediaFile(Base):
    """Models the MediaFile table"""

    __tablename__ = 'MediaFile'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)
    directory_id = Column(
        Integer, ForeignKey('MediaDirectory.id', ondelete='CASCADE'), nullable=False
    )
    path = Column(String, nullable=False, unique=True)
    indexed_at = Column(DateTime)

    @classmethod
    def build(cls, directory_id, path, indexed_at=None, id=None):
        record = cls()
        record.id = id
        record.directory_id = directory_id
        record.path = path
        record.indexed_at = indexed_at or datetime.datetime.now()
        return record


class MediaToken(Base):
    """Models the MediaToken table"""

    __tablename__ = 'MediaToken'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)
    token = Column(String, nullable=False, unique=True)

    @classmethod
    def build(cls, token, id=None):
        record = cls()
        record.id = id
        record.token = token
        return record


class MediaFileToken(Base):
    """Models the MediaFileToken table"""

    __tablename__ = 'MediaFileToken'

    file_id = Column(
        Integer, ForeignKey('MediaFile.id', ondelete='CASCADE'), nullable=False
    )
    token_id = Column(
        Integer, ForeignKey('MediaToken.id', ondelete='CASCADE'), nullable=False
    )

    __table_args__ = (PrimaryKeyConstraint(file_id, token_id), {})

    @classmethod
    def build(cls, file_id, token_id, id=None):
        record = cls()
        record.id = id
        record.file_id = file_id
        record.token_id = token_id
        return record


# vim:sw=4:ts=4:et: