Skip to content

zimscraperlib.zim.indexing

Special item with customized index data and helper classes

Classes:

  • IndexData

    IndexData to properly pass indexing title and content to the libzim

Functions:

Attributes:

IGNORED_MUPDF_MESSAGES module-attribute

# MuPDF warning lines which are known to be irrelevant for our use-case: when PDF
# index data is extracted, a warning line exactly equal to one of these entries is
# dropped before the remaining warnings are logged.
IGNORED_MUPDF_MESSAGES = [
    "lcms: not an ICC profile, invalid signature.",
    "format error: cmsOpenProfileFromMem failed",
    "ignoring broken ICC profile",
]

IndexData

IndexData(
    title: str,
    content: str,
    keywords: str = "",
    wordcount: int | None = None,
)

Bases: IndexData

IndexData to properly pass indexing title and content to the libzim

Both title and content have to be customized (title can be identical to item title or not). keywords is optional since it can be empty. wordcount is optional; if not passed, it is automatically computed from content.

Methods:

Attributes:

Source code in src/zimscraperlib/zim/indexing.py
26
27
28
29
30
31
32
33
def __init__(
    self, title: str, content: str, keywords: str = "", wordcount: int | None = None
):
    """Store the indexing title, content, keywords and (optional) word count

    The order of the assignments below matters: wordcount is assigned before
    content. NOTE(review): content is a writable property whose setter is not
    shown here; presumably it computes wordcount from the content when wordcount
    is None - confirm against the setter.
    """
    # set wordcount first so that we know if we should override it based on content
    self.wordcount = wordcount
    self.title = title
    self.content = content
    self.keywords = keywords

content property writable

content

keywords instance-attribute

keywords = keywords

title instance-attribute

title = title

wordcount instance-attribute

wordcount = wordcount

get_content

get_content() -> str
Source code in src/zimscraperlib/zim/indexing.py
41
42
def get_content(self) -> str:
    """Return the content to index, as currently held by this object"""
    indexed_content: str = self.content
    return indexed_content

get_keywords

get_keywords() -> str
Source code in src/zimscraperlib/zim/indexing.py
44
45
def get_keywords(self) -> str:
    """Return the keywords to index, as currently held by this object"""
    indexed_keywords: str = self.keywords
    return indexed_keywords

get_title

get_title() -> str
Source code in src/zimscraperlib/zim/indexing.py
38
39
def get_title(self) -> str:
    """Return the title to index, as currently held by this object"""
    indexed_title: str = self.title
    return indexed_title

get_wordcount

get_wordcount() -> int
Source code in src/zimscraperlib/zim/indexing.py
47
48
def get_wordcount(self) -> int:
    """Return the stored word count, falling back to 0 when it is unset or zero"""
    count = self.wordcount
    if not count:
        return 0
    return count

has_indexdata

has_indexdata() -> bool
Source code in src/zimscraperlib/zim/indexing.py
35
36
def has_indexdata(self) -> bool:
    """Tell whether there is anything to index: a non-empty content or title"""
    # content is checked first; title is only looked at when content is empty
    if len(self.content) > 0:
        return True
    return len(self.title) > 0

get_pdf_index_data

get_pdf_index_data(
    *,
    content: str | bytes | None = None,
    fileobj: BytesIO | None = None,
    filepath: Path | None = None,
) -> IndexData

Returns the IndexData information for a given PDF

PDF can be passed either as content or fileobject or filepath

Source code in src/zimscraperlib/zim/indexing.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def get_pdf_index_data(
    *,
    content: str | bytes | None = None,
    fileobj: io.BytesIO | None = None,
    filepath: pathlib.Path | None = None,
) -> IndexData:
    """Returns the IndexData information for a given PDF

    PDF can be passed either as content or fileobject or filepath: content is used
    when truthy, else fileobj when truthy, else filepath.

    The indexing title is made of the PDF "title", "author" and "subject" metadata
    entries which are set, joined with " - " (empty string when none is set). The
    indexing content is the text of all pages, joined with newlines.

    MuPDF warnings which are not listed in IGNORED_MUPDF_MESSAGES are logged as one
    single warning message.

    Raises:
        Exception: if PyMuPDF returns something else than a string for a page text
    """

    # do not display all pymupdf errors, we will filter them afterwards
    pymupdf.TOOLS.mupdf_display_errors(  # pyright: ignore[reportUnknownMemberType]
        False
    )

    if content:
        doc = pymupdf.open(stream=content)
    elif fileobj:
        doc = pymupdf.open(stream=fileobj)
    else:
        doc = pymupdf.open(filename=filepath)

    def get_pdf_content(page: pymupdf.Page) -> str:
        text = (  # pyright: ignore[reportUnknownVariableType]
            page.get_text()  # pyright: ignore[reportUnknownMemberType]
        )
        if not isinstance(text, str):  # pragma: no cover
            raise Exception("Unexpected text content")
        return text

    # the document is only needed while reading metadata and page text: close it
    # as soon as we are done (and also if extraction fails) instead of leaving the
    # underlying resources open until garbage collection
    with doc:
        metadata = (  # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
            doc.metadata
        )
        title = ""
        if metadata:  # pragma: no branch (always metadata in test PDFs)
            parts: list[str] = []
            for key in ["title", "author", "subject"]:
                if metadata.get(key):  # pyright: ignore[reportUnknownMemberType]
                    parts.append(
                        metadata[key]  # pyright: ignore[reportUnknownArgumentType]
                    )
            if parts:  # pragma: no branch (always metadata in test PDFs)
                title = " - ".join(parts)

        # use a dedicated name instead of rebinding the `content` parameter (which
        # holds the raw PDF data, not the extracted text)
        pdf_text = "\n".join(get_pdf_content(page) for page in doc)

    # build list of messages and filter messages which are known to not be relevant
    # in our use-case
    mupdf_messages = "\n".join(
        warning
        for warning in pymupdf.TOOLS.mupdf_warnings().splitlines()
        if warning not in IGNORED_MUPDF_MESSAGES
    )

    if mupdf_messages:
        logger.warning(
            f"PyMuPDF issues:\n{mupdf_messages}"
        )  # pragma: no cover (no known error in test PDFs)

    return IndexData(
        title=title,
        content=pdf_text,
    )