Skip to content

zimscraperlib.zim.items

libzim Item helpers

Classes:

  • Item

    libzim.writer.Item returning props for path/title/mimetype

  • StaticItem

    scraperlib Item with auto contentProvider from content or filepath

  • URLItem

    StaticItem to automatically fetch and feed a URL resource

Functions:

Item

Item(
    path: str | None = None,
    title: str | None = None,
    mimetype: str | None = None,
    hints: dict[Hint, int] | None = None,
    **kwargs: Any,
)

Bases: Item

libzim.writer.Item returning props for path/title/mimetype

Methods:

Attributes:

Source code in src/zimscraperlib/zim/items.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(
    self,
    path: str | None = None,
    title: str | None = None,
    mimetype: str | None = None,
    hints: dict[libzim.writer.Hint, int] | None = None,
    **kwargs: Any,
):
    """Record the given properties as attributes of the item

    path, title, mimetype and hints are only recorded when they are not None.
    Every other keyword argument is set on the instance under its own name."""
    super().__init__()
    named = {"path": path, "title": title, "mimetype": mimetype, "hints": hints}
    # named properties are appended after the extra keyword arguments
    kwargs.update(
        {name: value for name, value in named.items() if value is not None}
    )
    for attr_name, attr_value in kwargs.items():
        setattr(self, attr_name, attr_value)

should_index property

should_index

get_hints

get_hints() -> dict[Hint, int]
Source code in src/zimscraperlib/zim/items.py
60
61
def get_hints(self) -> dict[libzim.writer.Hint, int]:
    """Hints set on this item, or an empty dict when none were set"""
    try:
        return self.hints
    except AttributeError:
        return {}

get_mimetype

get_mimetype() -> str
Source code in src/zimscraperlib/zim/items.py
57
58
def get_mimetype(self) -> str:
    """Mimetype set on this item, or an empty string when none was set"""
    try:
        return self.mimetype
    except AttributeError:
        return ""

get_path

get_path() -> str
Source code in src/zimscraperlib/zim/items.py
51
52
def get_path(self) -> str:
    """Path set on this item, or an empty string when none was set"""
    try:
        return self.path
    except AttributeError:
        return ""

get_title

get_title() -> str
Source code in src/zimscraperlib/zim/items.py
54
55
def get_title(self) -> str:
    """Title set on this item, or an empty string when none was set"""
    try:
        return self.title
    except AttributeError:
        return ""

StaticItem

StaticItem(
    content: str | bytes | None = None,
    fileobj: IOBase | None = None,
    filepath: Path | None = None,
    path: str | None = None,
    title: str | None = None,
    mimetype: str | None = None,
    hints: dict[Hint, int] | None = None,
    index_data: IndexData | None = None,
    *,
    auto_index: bool = True,
    **kwargs: Any,
)

Bases: Item

scraperlib Item with auto contentProvider from content or filepath

Sets a ref to itself on the File/String content providers so that it outlives them. We need the Item to survive its ContentProvider so that we can track its lifecycle more efficiently: when libzim destroys the CP, Python destroys the Item and we are notified that we are effectively done with its content.

By default, content is automatically indexed, either by libzim itself for supported documents (text or HTML for now) or by python-scraperlib (only PDF is supported for now). If you do not want this, set auto_index to False to disable both kinds of indexing (libzim and python-scraperlib).

It is also possible to pass index_data to configure custom indexing of the item.

If item title is not set by caller, it is automatically populated from index_data.

Methods:

Attributes:

Source code in src/zimscraperlib/zim/items.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def __init__(
    self,
    content: str | bytes | None = None,
    fileobj: io.IOBase | None = None,
    filepath: pathlib.Path | None = None,
    path: str | None = None,
    title: str | None = None,
    mimetype: str | None = None,
    hints: dict[libzim.writer.Hint, int] | None = None,
    index_data: IndexData | None = None,
    *,
    auto_index: bool = True,
    **kwargs: Any,
):
    """Record content source and properties, then configure indexing

    content, fileobj and filepath are only recorded when they are not None.
    Indexing uses index_data when given; otherwise it is either disabled
    (auto_index is False) or left to the automatic indexing setup.
    When no title was set, the title exposed by the index data (if any) is used.
    """
    sources = {"content": content, "fileobj": fileobj, "filepath": filepath}
    kwargs.update(
        {name: value for name, value in sources.items() if value is not None}
    )
    super().__init__(
        path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
    )

    if index_data:
        # caller-provided index data always wins
        self.get_indexdata: Callable[[], IndexData] = lambda: index_data
    elif auto_index:
        self._get_auto_index()  # consider to add auto index
    else:
        self.get_indexdata = no_indexing_indexdata  # index nothing

    # Nothing more to do when the caller set a title
    # or when there is no index data to take one from
    if getattr(self, "title", None) or not hasattr(self, "get_indexdata"):
        return
    indexed_title = self.get_indexdata().get_title()
    if indexed_title:
        self.title = indexed_title

get_indexdata instance-attribute

get_indexdata: Callable[[], IndexData] = lambda: index_data

should_index property

should_index

title instance-attribute

title = title

get_contentprovider

get_contentprovider() -> ContentProvider
Source code in src/zimscraperlib/zim/items.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def get_contentprovider(self) -> libzim.writer.ContentProvider:
    # content was set manually
    content = getattr(self, "content", None)
    if content is not None:
        if not isinstance(content, str | bytes):
            raise AttributeError(f"Unexpected type for content: {type(content)}")
        return StringProvider(content=content, ref=self)

    # using a file-like object
    fileobj = getattr(self, "fileobj", None)
    if fileobj:
        return FileLikeProvider(
            fileobj=fileobj, ref=self, size=getattr(self, "size", None)
        )

    # we had to download locally to get size
    filepath = getattr(self, "filepath", None)
    if filepath:
        return FileProvider(
            filepath=filepath, ref=self, size=getattr(self, "size", None)
        )

    raise NotImplementedError("No data to provide`")

get_hints

get_hints() -> dict[Hint, int]
Source code in src/zimscraperlib/zim/items.py
60
61
def get_hints(self) -> dict[libzim.writer.Hint, int]:
    """Hints set on this item, or an empty dict when none were set"""
    try:
        return self.hints
    except AttributeError:
        return {}

get_mimetype

get_mimetype() -> str
Source code in src/zimscraperlib/zim/items.py
57
58
def get_mimetype(self) -> str:
    """Mimetype set on this item, or an empty string when none was set"""
    try:
        return self.mimetype
    except AttributeError:
        return ""

get_path

get_path() -> str
Source code in src/zimscraperlib/zim/items.py
51
52
def get_path(self) -> str:
    """Path set on this item, or an empty string when none was set"""
    try:
        return self.path
    except AttributeError:
        return ""

get_title

get_title() -> str
Source code in src/zimscraperlib/zim/items.py
54
55
def get_title(self) -> str:
    """Title set on this item, or an empty string when none was set"""
    try:
        return self.title
    except AttributeError:
        return ""

URLItem

URLItem(
    url: str,
    path: str | None = None,
    title: str | None = None,
    mimetype: str | None = None,
    hints: dict[Hint, int] | None = None,
    *,
    use_disk: bool | None = None,
    **kwargs: Any,
)

Bases: StaticItem

StaticItem to automatically fetch and feed a URL resource

Appropriate for retrieving/bundling static assets that you don't need to post-process.

Uses the URL's path as ZIM path if none is provided. Keeps a single in-memory copy of the content for HTML resources (indexed). Works transparently on servers returning a Content-Length header (most do). Stores a copy of the content either in memory or on disk (use_disk=True) in case the content size could not be retrieved from headers. Use tmp_dir to set the location of that temp file.

Methods:

Attributes:

Source code in src/zimscraperlib/zim/items.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
def __init__(
    self,
    url: str,
    path: str | None = None,
    title: str | None = None,
    mimetype: str | None = None,
    hints: dict[libzim.writer.Hint, int] | None = None,
    *,
    use_disk: bool | None = None,
    **kwargs: Any,
):
    """Prepare an item whose content is fetched from url

    Requests the first block of the URL to record the response headers, then
    makes sure the size of the content is known:
    - when the item should be indexed, the whole content is downloaded once
      into an in-memory fileobj
    - otherwise the Content-Length header is used, provided the response is
      not content-encoded
    - failing that, the resource is downloaded through download_for_size,
      to disk when use_disk is enabled and to memory otherwise

    Arguments:
        url: address of the resource to fetch
        path: forwarded to the parent constructor
        title: forwarded to the parent constructor
        mimetype: forwarded to the parent constructor
        hints: forwarded to the parent constructor
        use_disk: whether a download-for-size should be stored on disk;
          only recorded on the item when not None
        kwargs: forwarded to the parent constructor. A tmp_dir attribute,
          when present on the item, is passed to download_for_size

    Raises:
        OSError: the headers of the URL could not be retrieved
    """
    if use_disk is not None:
        kwargs["use_disk"] = use_disk
    super().__init__(
        path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
    )
    self.url = urllib.parse.urlparse(url)
    # effective setting: the attribute recorded on the item, False if absent
    use_disk_set: bool = getattr(self, "use_disk", False)

    # fetch headers to retrieve size and type
    try:
        _, self.headers = stream_file(
            url, byte_stream=io.BytesIO(), only_first_block=True
        )
    except Exception as exc:
        raise OSError(f"Unable to access URL at {url}: {exc}") from None

    # HTML content will be indexed.
    # we proxy the content in the Item to prevent double-download of the resource
    # we use a value-variable to prevent race-conditions in the multiple
    # reads of the content in the provider
    if self.should_index:
        self.fileobj = io.BytesIO()
        self.size, _ = stream_file(self.url.geturl(), byte_stream=self.fileobj)
        return

    try:
        # Encoded data (compressed) prevents us from using Content-Length header
        # as source for the content (it represents length of compressed data)
        if self.headers.get("Content-Encoding", "identity") != "identity":
            raise ValueError("Can't trust Content-Length for size")
        # non-html, non-compressed data.
        self.size = int(self.headers["Content-Length"])
    except Exception:
        # we couldn't retrieve size so we have to download resource to
        # find it out (deliberately broad: any failure leads to this fallback)
        target, self.size = self.download_for_size(
            self.url, on_disk=use_disk_set, tmp_dir=getattr(self, "tmp_dir", None)
        )
        # Decide with the very flag given to download_for_size: testing the
        # raw use_disk parameter could store a file path as fileobj whenever
        # the parameter and the effective setting differ
        if use_disk_set:
            # downloaded to disk and using a file path from now on
            self.filepath = target
        else:
            # downloaded to RAM and using a bytes object
            self.fileobj = target

fileobj instance-attribute

fileobj = BytesIO()

filepath instance-attribute

filepath = target

get_indexdata instance-attribute

get_indexdata: Callable[[], IndexData] = lambda: index_data

should_index property

should_index

size instance-attribute

size = int(headers['Content-Length'])

title instance-attribute

title = title

url instance-attribute

url = urlparse(url)

download_for_size staticmethod

download_for_size(
    url: ParseResult,
    tmp_dir: Path | None = None,
    *,
    on_disk: bool,
)

Download URL to a temp file (on_disk) or to an in-memory buffer and return that target along with its size

Source code in src/zimscraperlib/zim/items.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
@staticmethod
def download_for_size(
    url: urllib.parse.ParseResult,
    tmp_dir: pathlib.Path | None = None,
    *,
    on_disk: bool,
):
    """Download URL to a temp file or to memory and return that target and size

    Arguments:
        url: parsed URL of the resource to download
        tmp_dir: folder in which to create the temp file (on_disk only)
        on_disk: whether to download to a temp file instead of to memory

    Returns:
        (target, size): target is the path of the temp file when on_disk,
        otherwise the in-memory bytes buffer. size is the first value
        returned by stream_file.
    """
    fpath = stream = None
    if on_disk:
        # temp file reuses the suffix of the URL's path
        suffix = pathlib.Path(re.sub(r"^/", "", url.path)).suffix
        # Only the name of the (non-deleted) temp file is needed: close the
        # handle right away instead of leaving it open until collected
        with tempfile.NamedTemporaryFile(
            suffix=suffix, delete=False, dir=tmp_dir
        ) as tmp_file:
            fpath = pathlib.Path(tmp_file.name)
    else:
        stream = io.BytesIO()
    size, _ = stream_file(url.geturl(), fpath=fpath, byte_stream=stream)
    return fpath or stream, size

get_contentprovider

get_contentprovider()
Source code in src/zimscraperlib/zim/items.py
299
300
301
302
303
304
305
def get_contentprovider(self):
    """Provider from the parent class, or a URL-based one when it has no data"""
    try:
        provider = super().get_contentprovider()
    except NotImplementedError:
        # parent found no source to provide: provide from the URL instead
        provider = URLProvider(
            url=self.url.geturl(), size=getattr(self, "size", None), ref=self
        )
    return provider

get_hints

get_hints() -> dict[Hint, int]
Source code in src/zimscraperlib/zim/items.py
60
61
def get_hints(self) -> dict[libzim.writer.Hint, int]:
    """Hints set on this item, or an empty dict when none were set"""
    try:
        return self.hints
    except AttributeError:
        return {}

get_mimetype

get_mimetype() -> str
Source code in src/zimscraperlib/zim/items.py
292
293
294
295
296
297
def get_mimetype(self) -> str:
    """Mimetype set on this item, else the one from the response headers

    Falls back to application/octet-stream when the headers have no Content-Type.
    """
    # computed upfront: headers are read even when a mimetype was set
    from_headers = self.headers.get("Content-Type", "application/octet-stream")
    return getattr(self, "mimetype", from_headers)

get_path

get_path() -> str
Source code in src/zimscraperlib/zim/items.py
286
287
def get_path(self) -> str:
    """Path set on this item, else the URL's path without its leading slash"""
    from_url = re.sub(r"^/", "", self.url.path)
    return getattr(self, "path", from_url)

get_title

get_title() -> str
Source code in src/zimscraperlib/zim/items.py
289
290
def get_title(self) -> str:
    """Title set on this item, or an empty string when none was set"""
    try:
        return self.title
    except AttributeError:
        return ""

no_indexing_indexdata

no_indexing_indexdata() -> IndexData

IndexData asking libzim not to index this item

Source code in src/zimscraperlib/zim/items.py
64
65
66
def no_indexing_indexdata() -> IndexData:
    """Build the IndexData which tells libzim to skip indexing of an item"""
    blank = ""
    return IndexData(blank, blank)