[#407] Implemented torrent.csv backend

Fabio Manganiello 2024-06-23 01:35:46 +02:00
parent 8fc3201b8c
commit 91e2530dd5
10 changed files with 648 additions and 49 deletions


@ -30,6 +30,86 @@ from ._search import TorrentSearchProvider
class TorrentPlugin(Plugin):
"""
Plugin to search and download torrents.
Search
------
You can search for torrents using the :meth:`search` method. The method will
use the search providers configured in the ``search_providers`` attribute of
the plugin configuration. Currently supported search providers:
* ``popcorntime``:
:class:`platypush.plugins.torrent._search.PopcornTimeSearchProvider`
* ``torrents.csv``:
:class:`platypush.plugins.torrent._search.TorrentsCsvSearchProvider`
``torrents.csv`` will be enabled by default unless you explicitly disable
it. It supports any of the following backends:
* A remote API via the ``api_url`` attribute (default:
``https://torrents-csv.com/service``). You can also run your own API
server by following the instructions at `heretic/torrents-csv-server
<https://git.torrents-csv.com/heretic/torrents-csv-server>`_.
* A local checkout of the ``torrents.csv`` file. Clone the
`heretic/torrents-csv-data
<https://git.torrents-csv.com/heretic/torrents-csv-data>`_ repository and provide
the path to the ``torrents.csv`` file in the ``csv_file`` attribute.
* A local checkout of the ``torrents.db`` file built from the
``torrents.csv`` file. Follow the instructions at
`heretic/torrents-csv-data
<https://git.torrents-csv.com/heretic/torrents-csv-data>`_ on how to
build the ``torrents.db`` file from the ``torrents.csv`` file.
If you opt for a local checkout of the ``torrents.csv`` file, then
Platypush will build the SQLite database from the CSV file for you - no need
to use external services. This however means that the first search will be
slower as the database is being built. Subsequent searches will be faster,
unless you modify the CSV file - in this case, an updated database will be
built from the latest CSV file.
You can also specify the ``download_csv`` property in the configuration. In
this case, Platypush will automatically download the latest ``torrents.csv``
file locally and build the SQLite database from it. On startup, Platypush
will check if either the local or remote CSV file has been updated, and
rebuild the database if necessary.
``popcorntime`` will be disabled by default unless you explicitly enable it.
That's because, at the time of writing (June 2024), there are no publicly
available PopcornTime API servers. You can run your own PopcornTime API
server by following the instructions at `popcorn-time-ru/popcorn-ru
<https://github.com/popcorn-time-ru/popcorn-ru>`_.
Configuration example:
.. code-block:: yaml
torrent:
# ...
search_providers:
torrents.csv:
# Default: True
# enabled: true
# Base URL of the torrents.csv API.
api_url: https://torrents-csv.com/service
# Alternatively, you can also use a local checkout of the
# torrents.csv file.
# csv_file: /path/to/torrents.csv
# Or a manually built SQLite database from the torrents.csv file.
# db_file: /path/to/torrents.db
# Or automatically download the latest torrents.csv file.
# download_csv: true
popcorn_time:
# Default: false
# enabled: false
# Required: PopcornTime API base URL.
api_url: https://popcorntime.app
"""
_http_timeout = 20
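
For a concrete picture of the flow described in the docstring above, here is a
minimal sketch that exercises the new torrents.csv provider directly from
Python, bypassing the plugin. The class, the ``enabled`` keyword and the
``limit``/``page`` parameters are the ones introduced in this commit; reading
``title`` and ``url`` off the results mirrors the ``TorrentSearchResult``
constructor calls shown further down in this diff.

.. code-block:: python

    from platypush.plugins.torrent._search import TorrentsCsvSearchProvider

    # With no csv/db options configured, searches are delegated to the remote API.
    provider = TorrentsCsvSearchProvider(
        api_url='https://torrents-csv.com/service',
        enabled=True,
    )

    # search() returns a list of TorrentSearchResult objects; limit/page map to
    # the API's size/page parameters (defaults: 25 results, page 1).
    for result in provider.search('debian', limit=10, page=1):
        print(result.title, result.url)
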
@ -54,41 +134,7 @@ class TorrentPlugin(Plugin):
:param download_dir: Directory where the videos/torrents will be
downloaded (default: ``~/Downloads``).
:param torrent_ports: Torrent ports to listen on (default: 6881 and 6891)
:param search_providers: List of search providers to use. Each provider
has its own supported configuration and needs to be an instance of
:class:`TorrentSearchProvider`. Currently supported providers:
* :class:`platypush.plugins.torrent._search.PopcornTimeSearchProvider`
* :class:`platypush.plugins.torrent._search.TorrentCsvSearchProvider`
Configuration example:
.. code-block:: yaml
torrent:
# ...
search_providers:
torrent_csv:
# Default: True
# enabled: true
# Base URL of the torrent-csv API.
# See https://git.torrents-csv.com/heretic/torrents-csv-server
# for how to run your own torrent-csv API server.
api_url: https://torrents-csv.com/service
# Alternatively, you can also use a local checkout of the
# torrent.csv file. Clone
# https://git.torrents-csv.com/heretic/torrents-csv-data
# and provide the path to the torrent.csv file here.
# csv_file: /path/to/torrent.csv
popcorn_time:
# Default: False
# enabled: false
# Required: PopcornTime API base URL.
# See https://github.com/popcorn-time-ru/popcorn-ru for
# how to run your own PopcornTime API server.
api_url: https://popcorntime.app
:param search_providers: List of search providers to use.
"""
super().__init__(**kwargs)
@ -96,7 +142,13 @@ class TorrentPlugin(Plugin):
self.download_dir = os.path.abspath(
os.path.expanduser(download_dir or get_default_downloads_dir())
)
self._search_providers = self._load_search_providers(search_providers)
self.logger.info(
'Loaded search providers: %s',
[provider.provider_name() for provider in self._search_providers],
)
self._sessions = {}
self._lt_session = None
pathlib.Path(self.download_dir).mkdir(parents=True, exist_ok=True)
@ -107,18 +159,22 @@ class TorrentPlugin(Plugin):
Union[Dict[str, dict], Iterable[TorrentSearchProvider]]
],
) -> Iterable[TorrentSearchProvider]:
provider_classes = {
cls.provider_name(): cls
for _, cls in inspect.getmembers(search_module, inspect.isclass)
if issubclass(cls, TorrentSearchProvider) and cls != TorrentSearchProvider
}
if not search_providers:
return []
return [
provider()
for provider in provider_classes.values()
if provider.default_enabled()
]
parsed_providers = []
if isinstance(search_providers, dict):
providers_dict = {}
provider_classes = {
cls.provider_name(): cls
for _, cls in inspect.getmembers(search_module, inspect.isclass)
if issubclass(cls, TorrentSearchProvider)
and cls != TorrentSearchProvider
}
# Configure the search providers explicitly passed in the configuration
for provider_name, provider_config in search_providers.items():
@ -523,7 +579,9 @@ class TorrentPlugin(Plugin):
:type torrent: str
"""
assert torrent in self.transfers, f"No transfer in progress for {torrent}"
if not self.transfers.get(torrent):
self.logger.info('No transfer in progress for %s', torrent)
return
self.transfers[torrent].pause()
del self.torrent_state[torrent]


@ -1,10 +1,12 @@
from ._base import TorrentSearchProvider
from ._popcorntime import PopcornTimeSearchProvider
from ._torrents_csv import TorrentsCsvSearchProvider
__all__ = [
'TorrentSearchProvider',
'PopcornTimeSearchProvider',
'TorrentsCsvSearchProvider',
]


@ -49,7 +49,7 @@ class TorrentSearchProvider(ABC):
)
return []
self.logger.debug('Searching for %r', query)
self.logger.debug("Searching for %r", query)
return self._search(query, *args, **kwargs)
@classmethod


@ -0,0 +1,5 @@
from ._facade import TorrentsCsvSearchProvider
__all__ = ['TorrentsCsvSearchProvider']
# vim:sw=4:ts=4:et:


@ -0,0 +1,100 @@
from abc import ABC, abstractmethod
from logging import getLogger
from typing import List, Optional
from urllib.parse import quote_plus
from .._base import TorrentSearchProvider
from .._model import TorrentSearchResult
class TorrentsCsvBaseProvider(TorrentSearchProvider, ABC):
"""
Base class for Torrents.csv search providers.
"""
_http_timeout = 20
_magnet_trackers = [
'http://125.227.35.196:6969/announce',
'http://210.244.71.25:6969/announce',
'http://210.244.71.26:6969/announce',
'http://213.159.215.198:6970/announce',
'http://37.19.5.139:6969/announce',
'http://37.19.5.155:6881/announce',
'http://87.248.186.252:8080/announce',
'http://asmlocator.ru:34000/1hfZS1k4jh/announce',
'http://bt.evrl.to/announce',
'http://bt.rutracker.org/ann',
'https://www.artikelplanet.nl',
'http://mgtracker.org:6969/announce',
'http://tracker.baravik.org:6970/announce',
'http://tracker.dler.org:6969/announce',
'http://tracker.filetracker.pl:8089/announce',
'http://tracker.grepler.com:6969/announce',
'http://tracker.mg64.net:6881/announce',
'http://tracker.tiny-vps.com:6969/announce',
'http://tracker.torrentyorg.pl/announce',
'https://internet.sitelio.me/',
'https://computer1.sitelio.me/',
'udp://168.235.67.63:6969',
'udp://37.19.5.155:2710',
'udp://46.148.18.250:2710',
'udp://46.4.109.148:6969',
'udp://computerbedrijven.bestelinks.nl/',
'udp://computerbedrijven.startsuper.nl/',
'udp://computershop.goedbegin.nl/',
'udp://c3t.org',
'udp://allerhandelenlaag.nl',
'udp://tracker.opentrackr.org:1337',
'udp://tracker.publicbt.com:80',
'udp://tracker.tiny-vps.com:6969',
'udp://tracker.openbittorrent.com:80',
'udp://opentor.org:2710',
'udp://tracker.ccc.de:80',
'udp://tracker.blackunicorn.xyz:6969',
'udp://tracker.coppersurfer.tk:6969',
'udp://tracker.leechers-paradise.org:6969',
]
def __init__(
self,
trackers: Optional[List[str]] = None,
**kwargs,
):
"""
:param trackers: List of additional trackers to use.
"""
super().__init__(**kwargs)
self.logger = getLogger(self.__class__.__name__)
self.trackers = list({*self._magnet_trackers, *(trackers or [])})
@classmethod
def provider_name(cls) -> str:
return 'torrents.csv'
@abstractmethod
def _search( # pylint: disable=arguments-differ
self, query: str, *_, limit: int, page: int, **__
) -> List[TorrentSearchResult]:
"""
To be implemented by subclasses.
:param query: Query string.
:param limit: Number of results to return (default: 25).
:param page: Page number (default: 1).
"""
def _to_magnet(self, info_hash: str, torrent_name: str) -> str:
"""
Generate a magnet link from an info hash and torrent name.
:param info_hash: Torrent info hash.
:param torrent_name: Torrent name.
:return: Magnet link.
"""
return (
f'magnet:?xt=urn:btih:{info_hash}&dn={quote_plus(torrent_name)}&tr='
+ '&tr='.join([quote_plus(tracker) for tracker in self.trackers])
)
# vim:sw=4:ts=4:et:
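
For illustration, the snippet below restates the ``_to_magnet`` construction as
a standalone sketch, with made-up example values and only two of the default
trackers (the real provider appends its whole tracker list):

.. code-block:: python

    from urllib.parse import quote_plus

    # Example values, made up for illustration.
    info_hash = 'dd8255ecdc7ca55fb0bbf81323d87062db1f6d1c'
    torrent_name = 'Big Buck Bunny'
    trackers = [
        'udp://tracker.opentrackr.org:1337',
        'udp://tracker.openbittorrent.com:80',
    ]

    magnet = (
        f'magnet:?xt=urn:btih:{info_hash}&dn={quote_plus(torrent_name)}&tr='
        + '&tr='.join(quote_plus(tracker) for tracker in trackers)
    )
    # -> magnet:?xt=urn:btih:dd82...&dn=Big+Buck+Bunny
    #    &tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80
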


@ -0,0 +1,15 @@
"""
Common Torrents CSV constants.
"""
TORRENT_CSV_API_URL = 'https://torrents-csv.com/service'
""" Default Torrents CSV API base URL. """
TORRENTS_CSV_DOWNLOAD_URL = 'https://git.torrents-csv.com/heretic/torrents-csv-data/raw/branch/main/torrents.csv'
""" Default torrents.csv download URL. """
TORRENTS_CSV_URL_LAST_CHECKED_VAR = '_TORRENTS_CSV_URL_LAST_CHECKED'
""" Environment variable to store the last checked timestamp for the torrents.csv URL. """
TORRENTS_CSV_DEFAULT_CHECK_INTERVAL = 60 * 60 * 24
""" Interval in seconds to re-check the torrents.csv URL (24 hours). """


@ -0,0 +1,84 @@
import os
from typing import List, Optional
from .._model import TorrentSearchResult
from ._base import TorrentsCsvBaseProvider
from ._constants import (
TORRENT_CSV_API_URL,
TORRENTS_CSV_DOWNLOAD_URL,
TORRENTS_CSV_DEFAULT_CHECK_INTERVAL,
)
from .api import TorrentsCsvAPIProvider
from .local import TorrentsCsvLocalProvider
class TorrentsCsvSearchProvider(TorrentsCsvBaseProvider):
"""
Search provider that uses `Torrents.csv <https://torrents-csv.com/>`_ to search
for torrents, either through the remote API or through a local database.
"""
def __init__(
self,
api_url: str = TORRENT_CSV_API_URL,
csv_url: str = TORRENTS_CSV_DOWNLOAD_URL,
download_csv: bool = False,
csv_path: Optional[str] = None,
db_path: Optional[str] = None,
csv_url_check_interval: int = TORRENTS_CSV_DEFAULT_CHECK_INTERVAL,
**kwargs
):
"""
:param api_url: Torrents.csv API URL.
:param csv_url: Torrents.csv CSV URL.
:param download_csv: Whether to download the CSV file.
:param csv_path: Path to the CSV file.
:param db_path: Path to the SQLite database file.
:param csv_url_check_interval: Interval to check for CSV updates.
"""
super().__init__(**kwargs)
self.api_url = api_url
self.csv_url = csv_url
self.download_csv = download_csv
self.csv_path = os.path.expanduser(csv_path) if csv_path else None
self.db_path = os.path.expanduser(db_path) if db_path else None
self.csv_url_check_interval = csv_url_check_interval
@property
def _delegate(self) -> TorrentsCsvBaseProvider:
"""
:return: The provider to delegate the search to.
"""
if self.download_csv or self.csv_path or self.db_path:
return TorrentsCsvLocalProvider(
download_csv=self.download_csv,
csv_url=self.csv_url,
csv_path=self.csv_path,
db_path=self.db_path,
csv_url_check_interval=self.csv_url_check_interval,
enabled=True,
)
return TorrentsCsvAPIProvider(api_url=self.api_url, enabled=True)
@classmethod
def default_enabled(cls) -> bool:
"""
This provider is enabled by default.
"""
return True
def _search(
self, query: str, *_, limit: int = 25, page: int = 1, **__
) -> List[TorrentSearchResult]:
"""
Search for torrents through the configured Torrents.csv backend (API or local database).
:param query: Query string.
:param limit: Number of results to return (default: 25).
:param page: Page number (default: 1).
"""
return list(self._delegate.search(query=query, limit=limit, page=page))
# vim:sw=4:ts=4:et:
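
The ``_delegate`` property above picks the backend from the constructor
arguments: any of ``download_csv``, ``csv_path`` or ``db_path`` selects the
local SQLite backend, otherwise the remote API is used. A hedged sketch of the
two local variants follows (the path is hypothetical, and the first search
through the local backend downloads and/or builds the database, so it can take
a while):

.. code-block:: python

    from platypush.plugins.torrent._search import TorrentsCsvSearchProvider

    # Build the SQLite database from an existing local checkout of torrents.csv.
    from_csv = TorrentsCsvSearchProvider(
        csv_path='~/repos/torrents-csv-data/torrents.csv',  # hypothetical path
        enabled=True,
    )

    # Or let the provider download the CSV and rebuild the database when it changes.
    auto_download = TorrentsCsvSearchProvider(
        download_csv=True,
        enabled=True,
    )

    # from_csv.search('debian')  # the first call builds torrents.db next to the CSV
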


@ -0,0 +1,59 @@
from typing import List
import requests
from .._model import TorrentSearchResult
from ._base import TorrentsCsvBaseProvider
class TorrentsCsvAPIProvider(TorrentsCsvBaseProvider):
"""
Search provider that uses `Torrents.csv <https://torrents-csv.com/>`_ or any other
`Torrents.csv API <https://torrents-csv.com/service>`_ instance to search
for torrents.
"""
def __init__(self, api_url: str, **kwargs):
"""
:param api_url: Torrents.csv API base URL.
"""
super().__init__(**kwargs)
self.api_url = api_url
def _search(
self, query: str, *_, limit: int, page: int, **__
) -> List[TorrentSearchResult]:
"""
Perform a search of torrents using the Torrents.csv API.
:param query: Query string.
:param limit: Number of results to return (default: 25).
:param page: Page number (default: 1).
"""
response = requests.get(
f'{self.api_url}/search',
params={
'q': query,
'size': limit,
'page': page,
},
timeout=self._http_timeout,
)
response.raise_for_status()
return [
TorrentSearchResult(
title=torrent.get('name', '[No Title]'),
url=self._to_magnet(
info_hash=torrent.get('infohash'), torrent_name=torrent.get('name')
),
size=torrent.get('size_bytes'),
created_at=torrent.get('created_unix'),
seeds=torrent.get('seeders'),
peers=torrent.get('leechers'),
)
for torrent in response.json().get('torrents', [])
]
# vim:sw=4:ts=4:et:
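
The keys read from the response above imply a payload shaped roughly like the
following; this is a hedged reconstruction based only on the fields this code
consumes, with invented values:

.. code-block:: python

    # Shape of the /search response as consumed by _search() above
    # (only the keys actually read are shown; the values are made up).
    example_response = {
        'torrents': [
            {
                'name': 'Some Linux ISO',
                'infohash': '0123456789abcdef0123456789abcdef01234567',
                'size_bytes': 1_500_000_000,
                'created_unix': 1718000000,
                'seeders': 42,
                'leechers': 7,
            },
        ],
    }
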


@ -0,0 +1,272 @@
import datetime as dt
import os
import pathlib
import re
import stat
import subprocess
import time
from threading import RLock
from typing import List, Optional
from urllib.parse import quote_plus
import requests
from sqlalchemy import create_engine, text
from platypush.config import Config
from platypush.context import Variable
from .._model import TorrentSearchResult
from ._base import TorrentsCsvBaseProvider
from ._constants import TORRENTS_CSV_URL_LAST_CHECKED_VAR
SQL_INIT_TEMPLATE = """
create table torrent_tmp (
infohash text primary key,
name text not null,
size_bytes integer not null,
created_unix integer(4) not null,
seeders integer not null,
leechers integer not null,
completed integer not null,
scraped_date integer(4) not null,
published integer(4) not null
);
.separator ,
.import --skip 1 '{csv_file}' torrent_tmp
create index idx_name on torrent_tmp(lower(name));
create index idx_seeders on torrent_tmp(seeders);
create index idx_created_unix on torrent_tmp(created_unix);
drop table if exists torrent;
alter table torrent_tmp rename to torrent;
"""
class TorrentsCsvLocalProvider(TorrentsCsvBaseProvider):
"""
This class is responsible for managing a local checkout of the torrents-csv
dataset.
"""
def __init__(
self,
download_csv: bool,
csv_url: str,
csv_url_check_interval: int,
csv_path: Optional[str] = None,
db_path: Optional[str] = None,
**kwargs,
):
"""
Note that at least one among ``download_csv``, ``csv_path`` and ``db_path``
should be provided.
:param download_csv: If True then the CSV file will be downloaded from the
specified ``csv_url``.
:param csv_url: The URL from which the CSV file will be downloaded.
:param csv_url_check_interval: The interval in seconds after which the CSV
should be checked for updates.
:param csv_path: The path to the CSV file. If not provided, and ``download_csv``
is set to True, then the CSV file will be downloaded to
``<WORKDIR>/torrent/torrents.csv``.
:param db_path: The path to the SQLite database. If not provided, and
``csv_path`` or ``download_csv`` are set, then the database will be created
from a local copy of the CSV file.
"""
super().__init__(**kwargs)
assert (
download_csv or csv_path or db_path
), 'You must provide at least one of download_csv, csv_path or db_path'
self._init_csv_lock = RLock()
self._init_db_lock = RLock()
self._csv_url_check_interval = csv_url_check_interval
if download_csv:
csv_path = (
os.path.expanduser(csv_path)
if csv_path
else os.path.join(Config.get_workdir(), 'torrent', 'torrents.csv')
)
with self._init_csv_lock:
self._download_csv(csv_url=csv_url, csv_path=csv_path)
if csv_path:
db_path = (
os.path.expanduser(db_path)
if db_path
else os.path.join(os.path.dirname(csv_path), 'torrents.db')
)
with self._init_db_lock:
self._build_db(csv_path=csv_path, db_path=db_path)
assert db_path, 'No download_csv, csv_path or db_path provided'
assert os.path.isfile(db_path), f'Invalid db_path: {db_path}'
self.db_path = db_path
def _get_engine(self):
return create_engine(
'sqlite:///' + ('/'.join(map(quote_plus, self.db_path.split(os.path.sep))))
)
def _download_csv(self, csv_url: str, csv_path: str):
if not self._should_download_csv(
csv_url=csv_url,
csv_path=csv_path,
csv_url_check_interval=self._csv_url_check_interval,
):
return
self.logger.info(
'Downloading the torrents CSV file from %s to %s', csv_url, csv_path
)
response = requests.get(csv_url, stream=True, timeout=60)
response.raise_for_status()
size = int(response.headers.get('Content-Length', 0))
torrents_csv_dir = os.path.dirname(csv_path)
pathlib.Path(torrents_csv_dir).mkdir(parents=True, exist_ok=True)
with open(csv_path, 'wb') as f:
written = 0
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
written += len(chunk)
if size:
percent = 100.0 * written / size
prev_percent = max(0, 100.0 * (written - len(chunk)) / size)
if round(percent / 5) > round(prev_percent / 5):
self.logger.info('... %.2f%%\r', percent)
self.logger.info('Downloaded the torrents CSV file to %s', csv_path)
def _build_db(self, csv_path: str, db_path: str):
if not self._should_update_db(csv_path, db_path):
return
self.logger.info(
'Refreshing SQLite database %s from CSV file %s', db_path, csv_path
)
db_dir = os.path.dirname(db_path)
pathlib.Path(db_dir).mkdir(parents=True, exist_ok=True)
with subprocess.Popen(
['sqlite3', db_path], stdin=subprocess.PIPE, text=True
) as proc:
proc.communicate(SQL_INIT_TEMPLATE.format(csv_file=csv_path))
self.logger.info(
'Refreshed SQLite database %s from CSV file %s: ready to search',
db_path,
csv_path,
)
@staticmethod
def _should_update_db(csv_path: str, db_path: str) -> bool:
if not os.path.isfile(csv_path):
return False
if not os.path.isfile(db_path):
return True
return os.stat(db_path)[stat.ST_MTIME] < os.stat(csv_path)[stat.ST_MTIME]
def _should_download_csv(
self, csv_url: str, csv_path: str, csv_url_check_interval: int
) -> bool:
if not os.path.isfile(csv_path):
self.logger.info('CSV file %s not found, downloading it', csv_path)
return True
if not self._should_check_csv_url(csv_url_check_interval):
self.logger.debug('No need to check the CSV URL %s', csv_url)
return False
request = requests.head(csv_url, timeout=10)
request.raise_for_status()
last_modified_hdr = request.headers.get('Last-Modified')
Variable(TORRENTS_CSV_URL_LAST_CHECKED_VAR).set(time.time())
if not last_modified_hdr:
self.logger.debug(
"No Last-Modified header found in the CSV URL, can't compare thus downloading"
)
return True
return (
time.mktime(time.strptime(last_modified_hdr, '%a, %d %b %Y %H:%M:%S %Z'))
> os.stat(csv_path)[stat.ST_MTIME]
)
@staticmethod
def _should_check_csv_url(csv_url_check_interval: int) -> bool:
last_checked = round(
float(Variable(TORRENTS_CSV_URL_LAST_CHECKED_VAR).get() or 0)
)
return bool(
csv_url_check_interval
and time.time() - last_checked > csv_url_check_interval
)
def _search(
self, query: str, *_, limit: int, page: int, **__
) -> List[TorrentSearchResult]:
self.logger.debug(
"Searching for %r on %s, limit=%d, page=%d",
query,
self.db_path,
limit,
page,
)
tokens = re.split(r'[^\w]', query.lower())
where = ' and '.join(
f'lower(name) like :token{i}' for i, _ in enumerate(tokens)
)
tokens = {f'token{i}': f'%{token}%' for i, token in enumerate(tokens)}
with self._get_engine().connect() as conn:
self.logger.debug('Connected to the database: %s', conn.engine.url)
results = conn.execute(
text(
f"""
select infohash, name, size_bytes, seeders, leechers, created_unix
from torrent
where {where}
order by seeders desc, created_unix desc
limit :limit
offset :offset
"""
),
{
**tokens,
'limit': max(int(limit), 0),
'offset': max(int(limit * (page - 1)), 0),
},
).all()
self.logger.debug('Found %d results', len(results))
return [
TorrentSearchResult(
title=result[1],
url=self._to_magnet(
info_hash=result[0],
torrent_name=result[1],
),
size=result[2],
seeds=int(result[3] or 0),
peers=int(result[4] or 0),
created_at=(
dt.datetime.fromtimestamp(result[5], tz=dt.timezone.utc)
if result[5]
else None
),
)
for result in results
]
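
To make the query construction in ``_search`` above concrete, here is a short
trace of what it builds for a sample query, using the same tokenization and
parameter naming:

.. code-block:: python

    import re

    query = 'ubuntu 24.04'
    tokens = re.split(r'[^\w]', query.lower())  # ['ubuntu', '24', '04']
    where = ' and '.join(f'lower(name) like :token{i}' for i, _ in enumerate(tokens))
    params = {f'token{i}': f'%{token}%' for i, token in enumerate(tokens)}

    # where  -> "lower(name) like :token0 and lower(name) like :token1 and lower(name) like :token2"
    # params -> {'token0': '%ubuntu%', 'token1': '%24%', 'token2': '%04%'}
    # The full statement then orders by seeders desc, created_unix desc and applies
    # limit = max(limit, 0) and offset = max(limit * (page - 1), 0).
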


@ -3,16 +3,20 @@
"events": {},
"install": {
"apk": [
"py3-libtorrent-rasterbar"
"py3-libtorrent-rasterbar",
"sqlite3"
],
"apt": [
"python3-libtorrent"
"python3-libtorrent",
"sqlite3"
],
"dnf": [
"rb_libtorrent-python3"
"rb_libtorrent-python3",
"sqlite3"
],
"pacman": [
"libtorrent-rasterbar"
"libtorrent-rasterbar",
"sqlite3"
],
"pip": [
"libtorrent"
@ -21,4 +25,4 @@
"package": "platypush.plugins.torrent",
"type": "plugin"
}
}
}
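
The ``sqlite3`` system package added to the manifest above is needed because
the local torrents.csv backend shells out to the ``sqlite3`` command-line tool
to bulk-import the CSV (the ``.separator``/``.import`` dot-commands in
``SQL_INIT_TEMPLATE`` are CLI-only features). A small, hedged availability
check one could run:

.. code-block:: python

    import shutil

    # The local provider needs the sqlite3 CLI on the PATH; the Python sqlite3
    # module alone cannot execute the .import bulk-load used to build the database.
    assert shutil.which('sqlite3'), 'sqlite3 CLI not found: install the sqlite3 package'
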