zimscraperlib.html

Tools to work with HTML contents

Functions:

find_language_in –

Extract the language from HTML content
find_language_in_file –

Extract the language from an HTML file
find_title_in –

Extract the title from HTML content
find_title_in_file –

Extract the title from an HTML file

find_language_in

find_language_in(
    content: str | BinaryIO | TextIO, mime_type: str
) -> str

Extract the language from HTML content.

Returns an empty string if extraction fails or if the content is not HTML.

Source code in src/zimscraperlib/html.py

def find_language_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
    """Extract the language declared in HTML content

    Looks, in this order, at the `lang` then `xml:lang` attribute of `<html>`,
    the `lang` attribute of `<body>`, and the `content` attribute of a
    `<meta http-equiv="content-language">` tag. The value of the first matching
    attribute is returned as found in the document.

    Returns a blank string on failure to extract and for non-HTML files
    (mime_type other than ARTICLE_MIME)"""
    if mime_type != ARTICLE_MIME:
        return ""
    mapping = {"html": ["lang", "xml:lang"], "body": ["lang"], "meta": ["content"]}
    soup = BeautifulSoup(content, "lxml")
    for nodename, keylist in mapping.items():
        # A document commonly holds several <meta> tags (charset, viewport, ...)
        # ahead of the content-language one, so every <meta> is a candidate.
        # For <html> and <body> only the first occurrence is considered.
        if nodename == "meta":
            candidates = soup.find_all(nodename)
        else:
            candidates = [soup.find(nodename)]
        for node in candidates:
            # find() yields None when the tag is absent
            if not isinstance(node, element.Tag):
                continue
            if (
                nodename == "meta"
                and node.attrs.get("http-equiv", "").lower()  # pyright:ignore[reportUnknownMemberType, reportAttributeAccessIssue]
                != "content-language"
            ):
                # only the content-language <meta> carries a language
                continue
            for key in keylist:
                if node.has_attr(key):
                    return node.attrs[key]  # pyright:ignore[reportReturnType]
    return ""

find_language_in_file

find_language_in_file(fpath: Path, mime_type: str) -> str

Extract the language from an HTML file

Source code in src/zimscraperlib/html.py

def find_language_in_file(fpath: pathlib.Path, mime_type: str) -> str:
    """Extract the language from an HTML file

    The file is handed to find_language_in() as a binary stream.

    Returns a blank string if the file cannot be read or parsed, in addition
    to the cases where find_language_in() itself returns a blank string"""
    try:
        # Binary mode: text mode would decode with the platform's locale
        # encoding, which may not match the document; raw bytes leave the
        # encoding detection to the HTML parser.
        with open(fpath, "rb") as fh:
            return find_language_in(fh, mime_type)
    except Exception:
        # deliberate best-effort: any read/parse failure means "no language"
        return ""

find_title_in

find_title_in(
    content: str | BinaryIO | TextIO, mime_type: str | None
) -> str

Extract the title from HTML content.

Returns an empty string if extraction fails or if the content is not HTML.

Source code in src/zimscraperlib/html.py

def find_title_in(content: str | BinaryIO | TextIO, mime_type: str | None) -> str:
    """Text of the `<title>` tag found in HTML content

    Blank string when mime_type is not ARTICLE_MIME or when the parsed
    document has no `<title>` tag"""
    if mime_type == ARTICLE_MIME:
        soup = BeautifulSoup(content, "lxml")
        found_title = soup.find("title")
        if found_title:
            return found_title.text
    return ""

find_title_in_file

find_title_in_file(
    fpath: Path, mime_type: str | None
) -> str

Extract the title from an HTML file

Source code in src/zimscraperlib/html.py

def find_title_in_file(fpath: pathlib.Path, mime_type: str | None) -> str:
    """Extracted title from an HTML file"""
    try:
        with open(fpath) as fh:
            return find_title_in(fh, mime_type)
    except Exception:
        return ""