Skip to content

zimscraperlib.download

Classes:

Functions:

  • get_retry_adapter

    A requests adapter that automatically retries on known HTTP statuses that can be retried (413, 429, 500, 502, 503, 504)

  • get_session

    Session to hold cookies and connection pool together

  • save_large_file

    download a binary file from its URL, using wget

  • stream_file

    Stream data from a URL to either a BytesIO object or a file

BestMp4

BestMp4(**kwargs: str | bool | int | None)

Bases: YoutubeConfig

Methods:

Attributes:

Source code in src/zimscraperlib/download.py
82
83
84
85
def __init__(self, **kwargs: str | bool | int | None):
    """Build the config by layering defaults, class options and overrides.

    The dict is first filled with the class-level `defaults`; the class
    `options` and then the caller's keyword arguments are applied on top,
    so the last layer wins on conflicting keys."""
    class_defaults = type(self).defaults
    super().__init__(**class_defaults)
    for layer in (self.options, kwargs):
        self.update(layer)

defaults class-attribute

# Baseline options loaded into every config instance by __init__, before the
# class-specific `options` and any caller-supplied overrides.
defaults: dict[str, str | bool | int | None] = {
    "writethumbnail": True,
    "write_all_thumbnails": True,
    "writesubtitles": True,
    "allsubtitles": True,
    "subtitlesformat": "vtt",
    "keepvideo": False,
    "ignoreerrors": False,
    "retries": 20,
    # NOTE(review): the two hyphenated keys below look like command-line flag
    # spellings; the youtube_dl / yt-dlp Python API appears to expect
    # `fragment_retries` and `skip_unavailable_fragments` - confirm these
    # entries are actually honoured.
    "fragment-retries": 50,
    "skip-unavailable-fragments": True,
    # fallback output template used by get_options when none is supplied
    "outtmpl": "video.%(ext)s",
}

options class-attribute

# Class-specific options that __init__ applies on top of `defaults`.
options: dict[str, str | bool | int | None] = {
    "preferredcodec": "mp4",
    # presumably a youtube_dl format selector where `/` separates fallbacks:
    # a single mp4 file, else mp4 video + m4a audio, else anything - confirm
    "format": "best[ext=mp4]/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best",
}

get_options classmethod

get_options(
    target_dir: Path | None = None,
    filepath: Path | None = None,
    **options: str | bool | int | None,
)
Source code in src/zimscraperlib/download.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@classmethod
def get_options(
    cls,
    target_dir: pathlib.Path | None = None,
    filepath: pathlib.Path | None = None,
    **options: str | bool | int | None,
):
    if "outtmpl" not in options:
        outtmpl = cls.options.get("outtmpl", cls.defaults["outtmpl"])
        if not isinstance(outtmpl, str):
            raise ValueError(f"outtmpl must be a a str, {type(outtmpl)} found")
        if filepath:
            outtmpl = str(filepath)
        # send output to target_dir
        if target_dir:
            outtmpl = str(target_dir.joinpath(outtmpl))
        options["outtmpl"] = outtmpl

    config = cls()
    config.update(options)
    return config

BestWebm

BestWebm(**kwargs: str | bool | int | None)

Bases: YoutubeConfig

Methods:

Attributes:

Source code in src/zimscraperlib/download.py
82
83
84
85
def __init__(self, **kwargs: str | bool | int | None):
    """Build the config by layering defaults, class options and overrides.

    The dict is first filled with the class-level `defaults`; the class
    `options` and then the caller's keyword arguments are applied on top,
    so the last layer wins on conflicting keys."""
    class_defaults = type(self).defaults
    super().__init__(**class_defaults)
    for layer in (self.options, kwargs):
        self.update(layer)

defaults class-attribute

# Baseline options loaded into every config instance by __init__, before the
# class-specific `options` and any caller-supplied overrides.
defaults: dict[str, str | bool | int | None] = {
    "writethumbnail": True,
    "write_all_thumbnails": True,
    "writesubtitles": True,
    "allsubtitles": True,
    "subtitlesformat": "vtt",
    "keepvideo": False,
    "ignoreerrors": False,
    "retries": 20,
    # NOTE(review): the two hyphenated keys below look like command-line flag
    # spellings; the youtube_dl / yt-dlp Python API appears to expect
    # `fragment_retries` and `skip_unavailable_fragments` - confirm these
    # entries are actually honoured.
    "fragment-retries": 50,
    "skip-unavailable-fragments": True,
    # fallback output template used by get_options when none is supplied
    "outtmpl": "video.%(ext)s",
}

options class-attribute

# Class-specific options that __init__ applies on top of `defaults`.
options: dict[str, str | bool | int | None] = {
    "preferredcodec": "webm",
    # presumably a youtube_dl format selector where `/` separates fallbacks:
    # a single webm file, else webm video + webm audio, else anything - confirm
    "format": "best[ext=webm]/bestvideo[ext=webm]+bestaudio[ext=webm]/best",
}

get_options classmethod

get_options(
    target_dir: Path | None = None,
    filepath: Path | None = None,
    **options: str | bool | int | None,
)
Source code in src/zimscraperlib/download.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@classmethod
def get_options(
    cls,
    target_dir: pathlib.Path | None = None,
    filepath: pathlib.Path | None = None,
    **options: str | bool | int | None,
):
    if "outtmpl" not in options:
        outtmpl = cls.options.get("outtmpl", cls.defaults["outtmpl"])
        if not isinstance(outtmpl, str):
            raise ValueError(f"outtmpl must be a a str, {type(outtmpl)} found")
        if filepath:
            outtmpl = str(filepath)
        # send output to target_dir
        if target_dir:
            outtmpl = str(target_dir.joinpath(outtmpl))
        options["outtmpl"] = outtmpl

    config = cls()
    config.update(options)
    return config

YoutubeConfig

YoutubeConfig(**kwargs: str | bool | int | None)

Bases: dict[str, str | bool | int | None]

Methods:

Attributes:

Source code in src/zimscraperlib/download.py
82
83
84
85
def __init__(self, **kwargs: str | bool | int | None):
    """Build the config by layering defaults, class options and overrides.

    The dict is first filled with the class-level `defaults`; the class
    `options` and then the caller's keyword arguments are applied on top,
    so the last layer wins on conflicting keys."""
    class_defaults = type(self).defaults
    super().__init__(**class_defaults)
    for layer in (self.options, kwargs):
        self.update(layer)

defaults class-attribute

# Baseline options loaded into every config instance by __init__, before the
# class-specific `options` and any caller-supplied overrides.
defaults: dict[str, str | bool | int | None] = {
    "writethumbnail": True,
    "write_all_thumbnails": True,
    "writesubtitles": True,
    "allsubtitles": True,
    "subtitlesformat": "vtt",
    "keepvideo": False,
    "ignoreerrors": False,
    "retries": 20,
    # NOTE(review): the two hyphenated keys below look like command-line flag
    # spellings; the youtube_dl / yt-dlp Python API appears to expect
    # `fragment_retries` and `skip_unavailable_fragments` - confirm these
    # entries are actually honoured.
    "fragment-retries": 50,
    "skip-unavailable-fragments": True,
    # fallback output template used by get_options when none is supplied
    "outtmpl": "video.%(ext)s",
}

options class-attribute

# The base config adds no options of its own on top of `defaults`;
# BestMp4 and BestWebm provide their own `options` dict.
options: dict[str, str | bool | int | None] = {}

get_options classmethod

get_options(
    target_dir: Path | None = None,
    filepath: Path | None = None,
    **options: str | bool | int | None,
)
Source code in src/zimscraperlib/download.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@classmethod
def get_options(
    cls,
    target_dir: pathlib.Path | None = None,
    filepath: pathlib.Path | None = None,
    **options: str | bool | int | None,
):
    if "outtmpl" not in options:
        outtmpl = cls.options.get("outtmpl", cls.defaults["outtmpl"])
        if not isinstance(outtmpl, str):
            raise ValueError(f"outtmpl must be a a str, {type(outtmpl)} found")
        if filepath:
            outtmpl = str(filepath)
        # send output to target_dir
        if target_dir:
            outtmpl = str(target_dir.joinpath(outtmpl))
        options["outtmpl"] = outtmpl

    config = cls()
    config.update(options)
    return config

YoutubeDownloader

YoutubeDownloader(threads: int | None = 1)

Downloads YouTube videos using youtube_dl on a ThreadPoolExecutor with `threads` workers

Shutdown method must be run explicitly to free any occupied resources

Methods:

  • download

    Downloads video using initialized executor.

  • shutdown

    shuts down the executor, awaiting completion

Attributes:

Source code in src/zimscraperlib/download.py
23
24
def __init__(self, threads: int | None = 1) -> None:
    """Create the thread pool on which downloads are run.

    threads: passed as max_workers to ThreadPoolExecutor"""
    self.executor = ThreadPoolExecutor(max_workers=threads)

executor instance-attribute

executor = ThreadPoolExecutor(max_workers=threads)

download

download(
    url: str,
    options: dict[str, Any] | None,
    *,
    wait: bool | None = True,
) -> bool | Future[Any]

Downloads video using initialized executor.

Arguments: url - URL or Video ID; options - youtube_dl options dict; wait - whether to await download completion before returning.

Returns True once the download has completed (wait=True), or the Future running the download (wait=False)

Source code in src/zimscraperlib/download.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def download(
    self,
    url: str,
    options: dict[str, Any] | None,
    *,
    wait: bool | None = True,
) -> bool | Future[Any]:
    """Downloads video using initialized executor.

    url: URL or Video ID
    options: youtube_dl options dict
    wait: whether to await download completion before returning

    Returns download result of future (wait=False)"""

    future = self.executor.submit(self._run_youtube_dl, url, options or {})
    if not wait:
        return future
    exc = future.exception()
    if isinstance(exc, BaseException):
        raise exc
    return True

shutdown

shutdown() -> None

shuts down the executor, awaiting completion

Source code in src/zimscraperlib/download.py
32
33
34
def shutdown(self) -> None:
    """Shut the executor down, blocking until pending work has completed"""
    pool = self.executor
    pool.shutdown(wait=True)

get_retry_adapter

get_retry_adapter(
    max_retries: int | None = 5,
) -> BaseAdapter

A requests adapter that automatically retries on known HTTP statuses that can be retried (413, 429, 500, 502, 503, 504)

Source code in src/zimscraperlib/download.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def get_retry_adapter(
    max_retries: int | None = 5,
) -> requests.adapters.BaseAdapter:
    """Build a requests adapter that automatically retries failed requests.

    max_retries is used as the overall retry budget and also as the individual
    limit for connection errors and read errors. Retries triggered by an HTTP
    status are capped at 2 and only apply to the status codes listed below."""
    # force retry on the following codes
    retryable_statuses = [
        413,
        429,
        500,
        502,
        503,
        504,
    ]
    policy = urllib3.util.retry.Retry(
        total=max_retries,  # total number of retries
        connect=max_retries,  # connection errors
        read=max_retries,  # read errors
        status=2,  # failure HTTP status (only those listed above)
        redirect=False,  # don't fail on redirections
        backoff_factor=30,  # sleep factor between retries
        status_forcelist=retryable_statuses,
    )

    return requests.adapters.HTTPAdapter(max_retries=policy)

get_session

get_session(max_retries: int | None = 5) -> Session

Session to hold cookies and connection pool together

Source code in src/zimscraperlib/download.py
177
178
179
180
181
def get_session(max_retries: int | None = 5) -> requests.Session:
    """Session to hold cookies and connection pool together

    max_retries - forwarded to get_retry_adapter

    The retry adapter is mounted on both the "http://" and "https://"
    prefixes. requests selects the adapter with the longest matching prefix
    and a new Session already carries default adapters on exactly those two
    prefixes, so mounting on the shorter "http" prefix would leave the
    default, non-retrying adapters in charge of every request."""
    session = requests.Session()
    retry_adapter = get_retry_adapter(max_retries)
    for prefix in ("http://", "https://"):
        # replaces the default adapter registered for this prefix
        session.mount(prefix, retry_adapter)
    return session

save_large_file

save_large_file(
    url: str,
    fpath: Path,
    retries: int = 5,
    user_agent: str | None = None,
) -> None

download a binary file from its URL, using wget

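Arguments - url: URL of the file to download; fpath: path the file is written to; retries: number of tries given to wget; user_agent: optional User-Agent string given to wget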

Source code in src/zimscraperlib/download.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def save_large_file(
    url: str, fpath: pathlib.Path, retries: int = 5, user_agent: str | None = None
) -> None:
    """download a binary file from its URL, using wget

    Arguments -
        url - URL of the file to download
        fpath - Path of the file the download is written to (wget -O)
        retries - Number of tries handed to wget (-t)
        user_agent - Optional User-Agent string handed to wget (-U)
    Raises subprocess.CalledProcessError when wget exits with a non-zero status
    """
    command = [
        "/usr/bin/env",
        "wget",
        "-t",
        f"{retries}",
        "--retry-connrefused",
        "--random-wait",
        "--progress=dot:giga",
        "-O",
        str(fpath),
        "-c",
    ]
    if user_agent:
        command += ["-U", user_agent]
    # Every option is placed before "--" and the URL after it, so a URL that
    # starts with "-" can never be interpreted as a wget option and option
    # parsing does not depend on options being accepted after the URL.
    command += ["--", url]
    subprocess.run(
        command,
        check=True,
    )

stream_file

stream_file(
    url: str,
    fpath: Path | None = None,
    byte_stream: SupportsWrite[bytes]
    | SupportsSeekableWrite[bytes]
    | None = None,
    block_size: int | None = 1024,
    proxies: dict[str, str] | None = None,
    max_retries: int | None = 5,
    headers: dict[str, str] | None = None,
    session: Session | None = None,
    timeout: int | None = DEFAULT_WEB_REQUESTS_TIMEOUT,
    *,
    only_first_block: bool | None = False,
) -> tuple[int, CaseInsensitiveDict[str]]

Stream data from a URL to either a BytesIO object or a file.

Arguments:

  • fpath - Path of the file where data is sent
  • byte_stream - The BytesIO object where data is sent
  • block_size - Size of each chunk of data read in one iteration
  • proxies - A dict of proxies to be used (https://requests.readthedocs.io/en/master/user/advanced/#proxies)
  • only_first_block - Whether to download only one (first) block
  • max_retries - Maximum number of retries after which an error is raised. Does not apply if using your own session
  • session - Session object to make the request with. A new one is created otherwise

Returns the total number of bytes downloaded and the response headers.

Source code in src/zimscraperlib/download.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def stream_file(
    url: str,
    fpath: pathlib.Path | None = None,
    byte_stream: SupportsWrite[bytes] | SupportsSeekableWrite[bytes] | None = None,
    block_size: int | None = 1024,
    proxies: dict[str, str] | None = None,
    max_retries: int | None = 5,
    headers: dict[str, str] | None = None,
    session: requests.Session | None = None,
    timeout: int | None = DEFAULT_WEB_REQUESTS_TIMEOUT,
    *,
    only_first_block: bool | None = False,
) -> tuple[int, requests.structures.CaseInsensitiveDict[str]]:
    """Stream data from a URL to either a BytesIO object or a file
    Arguments -
        url - URL to stream the data from
        fpath - Path of the file where data is sent
        byte_stream - The BytesIO object where data is sent
        block_size - Size of each chunk of data read in one iteration
        proxies - A dict of proxies to be used
        https://requests.readthedocs.io/en/master/user/advanced/#proxies
        only_first_block - Whether to download only one (first) block
        max_retries - Maximum number of retries after which error is raised. Does not
        apply if using your own session
        headers - Request headers forwarded to the session's get call
        session - Session object to make the request with. A new one created otherwise
        timeout - Timeout forwarded to the session's get call
    Raises ValueError if neither fpath nor byte_stream is supplied, and whatever
    raise_for_status raises on a failure HTTP status
    Returns the total number of bytes downloaded and the response headers"""

    # if no output option is supplied
    if fpath is None and byte_stream is None:
        raise ValueError("Either file path or a bytesIO object is needed")

    if not session:
        session = get_session(max_retries)
    resp = session.get(
        url,
        stream=True,
        proxies=proxies,
        headers=headers,
        timeout=timeout,
    )

    total_downloaded = 0
    fpath_handler = None
    try:
        resp.raise_for_status()

        if fpath is not None:
            fpath_handler = open(fpath, "wb")

        for data in resp.iter_content(block_size):
            total_downloaded += len(data)
            if fpath_handler is not None:
                fpath_handler.write(data)
            # identity check: an output object must not be skipped merely
            # because it evaluates as falsy
            if byte_stream is not None:
                byte_stream.write(data)

            # stop downloading/reading if we're just testing first block
            if only_first_block:
                break
    finally:
        # always release the file handle and the streamed response, including
        # when the download fails midway or stops after the first block
        if fpath_handler is not None:
            fpath_handler.close()
        resp.close()

    logger.debug(f"Downloaded {total_downloaded} bytes from {url}")

    # the stream is only rewound when no file output was requested
    if (
        fpath_handler is None
        and isinstance(byte_stream, SupportsSeekableWrite)
        and byte_stream.seekable()
    ):
        byte_stream.seek(0)
    return total_downloaded, resp.headers