Skip to content

zimscraperlib.inputs

Functions:

compute_descriptions

compute_descriptions(
    default_description: str,
    user_description: str | None,
    user_long_description: str | None,
) -> tuple[str, str | None]

Computes short and long descriptions compliant with ZIM standard.

Based on provided parameters, the function computes a short and a long description which are compliant with the ZIM standard (in terms of length).

User description(s) are used if set. They are checked against the ZIM standard maximum lengths: an error is raised if one of them is too long; otherwise they are returned as-is.

If user_description is not set, the description is computed based on the default description, truncated if needed.

If user_long_description is not set and default description is too long for the description field, the long_description is computed based on the default description (truncated if needed), otherwise no long description is returned.

Parameters:

  • default_description (str) –

    the description which will be used if user descriptions are not set (typically fetched online)

  • user_description (str | None) –

    the description set by the user (typically set by a CLI argument)

  • user_long_description (str | None) –

    the long description set by the user (typically set by a CLI argument)

Returns a tuple of (description, long_description)

Source code in src/zimscraperlib/inputs.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def compute_descriptions(
    default_description: str,
    user_description: str | None,
    user_long_description: str | None,
) -> tuple[str, str | None]:
    """Builds the (description, long_description) pair for a ZIM.

    Both returned values respect the length limits held in MAX_DESC_LENGTH and
    MAX_LONG_DESC_LENGTH.

    User-supplied values take precedence: when set, they are validated against
    their respective limit (a ValueError is raised if they are too long) and
    returned untouched.

    When user_description is not set, the description falls back to the default
    description, cut down to MAX_DESC_LENGTH (ending with "…") if needed.

    When user_long_description is not set, a long description is derived from
    the default description only if the latter does not fit in the description
    field; it is cut down to MAX_LONG_DESC_LENGTH (ending with "…") if needed.
    Otherwise user_long_description is returned as received.

    args:
        default_description:   fallback text used when user descriptions are not
                               set (typically fetched online)
        user_description:      description set by the user (typically set by a
                               CLI argument)
        user_long_description: long description set by the user (typically set
                               by a CLI argument)

    Returns a tuple of (description, long_description)

    Raises ValueError if a user-supplied value exceeds its maximum length.
    """

    def _shorten(text: str, limit: int) -> str:
        # keep at most `limit` characters; when something had to be dropped,
        # the last kept character is swapped for an ellipsis
        head = text[0:limit]
        if len(text) > limit:
            return head[:-1] + "…"
        return head

    # user-supplied values are never truncated silently: they are rejected
    if user_description:
        desc_length = len(user_description)
        if desc_length > MAX_DESC_LENGTH:
            raise ValueError(
                f"Description too long ({desc_length}>{MAX_DESC_LENGTH})"
            )
    if user_long_description:
        long_desc_length = len(user_long_description)
        if long_desc_length > MAX_LONG_DESC_LENGTH:
            raise ValueError(
                f"LongDescription too long ({long_desc_length}"
                f">{MAX_LONG_DESC_LENGTH})"
            )

    # a long description is only derived when the default text overflows the
    # short description field
    default_overflows = len(default_description) > MAX_DESC_LENGTH
    long_description = user_long_description
    if default_overflows and not long_description:
        long_description = _shorten(default_description, MAX_LONG_DESC_LENGTH)

    description = user_description or _shorten(default_description, MAX_DESC_LENGTH)

    return description, long_description

compute_tags

compute_tags(
    default_tags: Iterable[str], user_tags: str | None
) -> set[str]

Computes a set of tag strings compliant with the ZIM standard.

Based on default tags (set by the scraper) and user-provided tags (usually retrieved from the CLI arguments), the function computes the set of tags to be used as metadata, compliant with the ZIM standard. It removes duplicates and empty values, and strips leading and trailing whitespace.

Parameters:

  • default_tags (Iterable[str]) –

    the list of default tags always set for a given scraper

  • user_tags (str | None) –

    the tags, separated by semi-colon, as given by user at CLI args

Returns a set of tags, ready to be passed to the creator

Source code in src/zimscraperlib/inputs.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def compute_tags(
    default_tags: Iterable[str],
    user_tags: str | None,
) -> set[str]:
    """Computes a list of tags string compliant with ZIM standard.

    Based on default tags (set by the scraper) and user provided tags (usually retrived
    from the CLI arguments), the function computes a tag string to be used as metadata
    which is compliant with the ZIM standard. It removes duplicates and empty values,
    and strip leading and trailing whitespaces.

    args:
        default_tags: the list of default tags always set for a given scraper
        user_tags:    the tags, separated by semi-colon, as given by user at CLI args

    Returns a set of tags, ready to be passed to the creator
    """

    return {
        tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag
    }

handle_user_provided_file

handle_user_provided_file(
    source: Path | str | None = None,
    dest: Path | None = None,
    in_dir: Path | None = None,
    user_agent: str | None = DEFAULT_USER_AGENT,
    *,
    nocopy: bool = False,
) -> Path | None

Return the path to a downloaded or copied user-provided file (URL or path)

Parameters:

  • source (Path | str | None, default: None ) –

    URL or path to a file (or None)

  • dest (Path | None, default: None ) –

    where to save the resulting file; a temporary filename is used if None

  • in_dir (Path | None, default: None ) –

    where to generate dest within if specified

  • nocopy (bool, default: False ) –

    don't make a copy of source if a path was provided. return source value instead

Source code in src/zimscraperlib/inputs.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def handle_user_provided_file(
    source: pathlib.Path | str | None = None,
    dest: pathlib.Path | None = None,
    in_dir: pathlib.Path | None = None,
    user_agent: str | None = DEFAULT_USER_AGENT,
    *,
    nocopy: bool = False,
) -> pathlib.Path | None:
    """path to a downloaded or copied user provided file (URL or path)

    args:
        source:     URL or path to a file (or None)
        dest:       where to save the resulting file; a temporary file is created
                    if None
        in_dir:     where to generate dest within if specified
        user_agent: value of the User-Agent header sent when source is a URL
                    (no custom header is sent if None or empty)
        nocopy:     don't make a copy of source if a path was provided.
                    return source value instead

    Returns None if source is not set or blank, the resolved source path if a
    path was provided and nocopy is set, dest otherwise.

    Raises OSError if source is a path which does not exist."""
    if not source or not str(source).strip():
        return None

    # Require a full scheme so that local paths merely starting with "http"
    # (e.g. "httpdocs/logo.png") are not mistaken for URLs.
    is_url = str(source).startswith(("http://", "https://"))

    # suffix is taken from the value as provided, before any path resolution
    suffix = pathlib.Path(source).suffix

    if not is_url:
        source = pathlib.Path(source).expanduser().resolve()
        if not source.exists():
            raise OSError(f"{source} could not be found.")
        if nocopy:
            return source

    # The temporary destination is created only once it is known to be needed,
    # so nothing is left behind on disk when source is missing or nocopy is set.
    if not dest:
        with tempfile.NamedTemporaryFile(
            suffix=suffix, dir=in_dir, delete=False
        ) as tmp_file:
            # delete=False keeps the file on disk once the handle is closed
            dest = pathlib.Path(tmp_file.name)

    if is_url:
        logger.debug(f"download {source} -> {dest}")
        headers = {"User-Agent": user_agent} if user_agent else None
        stream_file(url=str(source), fpath=dest, headers=headers)
    else:
        logger.debug(f"copy {source} -> {dest}")
        shutil.copy(source, dest)

    return dest

unique_values

unique_values(items: list[T]) -> list[T]

Return unique values in input list while preserving list order

Source code in src/zimscraperlib/inputs.py
136
137
138
def unique_values[T](items: list[T]) -> list[T]:
    """Return unique values in input list while preserving list order"""
    return list(dict.fromkeys(items))