Source code for woob.browser.filters.standard

# Copyright(C) 2014 Romain Bignon
#
# This file is part of woob.
#
# woob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# woob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with woob. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import datetime
import re
from collections.abc import Iterator
from decimal import Decimal, InvalidOperation
from itertools import islice
from numbers import Number
from typing import Any
from urllib.parse import parse_qs, urlparse

import pycountry
from dateutil.parser import parse as parse_date
from dateutil.tz import gettz
from lxml.etree import ElementBase as LXMLElement

from woob.browser.url import URL
from woob.capabilities.base import Currency as BaseCurrency
from woob.capabilities.base import empty
from woob.tools.misc import clean_text

from .base import _NO_DEFAULT, Filter, FilterError, ItemNotFound, _Filter, debug


# Public names exported by ``from woob.browser.filters.standard import *``.
# ``Filter`` and ``FilterError`` are re-exported from ``.base``; the remaining
# names are defined in this module.
__all__ = [
    "Filter",
    "FilterError",
    "RegexpError",
    "FormatError",
    "AsyncLoad",
    "Async",
    "Base",
    "Decode",
    "Env",
    "RawText",
    "CleanText",
    "Lower",
    "Upper",
    "Title",
    "Currency",
    "NumberFormatError",
    "CleanDecimal",
    "Slugify",
    "Type",
    "Field",
    "Regexp",
    "Map",
    "MapIn",
    "DateTime",
    "FromTimestamp",
    "Date",
    "DateGuesser",
    "Time",
    "Duration",
    "MultiFilter",
    "CombineDate",
    "Format",
    "BrowserURL",
    "Join",
    "MultiJoin",
    "Eval",
    "QueryValue",
    "Coalesce",
    "CountryCode",
]


class RegexpError(FilterError):
    """Error reported by :class:`Regexp` when its pattern cannot be applied or is not found."""


class FormatError(FilterError):
    """Error reported when a selected value cannot be parsed into the expected format."""


class AsyncLoad(Filter):
    """Start loading a page asynchronously so it can be used later.

    Usually paired with the :class:`Async` filter, which reads the page
    once it has been fetched.
    """

    def __call__(self, item):
        """Open the selected link through the browser's ``async_open``.

        :returns: whatever ``async_open`` returns, or ``None`` when the
                  selector yields a falsy link.
        """
        target = self.select(self.selector, item)
        if not target:
            return None
        return item.page.browser.async_open(target)


class Async(Filter):
    """Apply a selector on another page that was fetched earlier.

    Usually paired with the :class:`AsyncLoad` filter.
    The URL of the other page must be matched with a Page by the Browser.

    Example::

        class item(ItemElement):
            load_details = Field('url') & AsyncLoad

            obj_description = Async('details') & CleanText('//h3')
    """

    def __init__(self, name, selector=None):
        """
        :param name: key of the loader in ``item.loaders``
        :param selector: filter to run on the loaded page's document
        """
        super().__init__()
        self.name = name
        self.selector = selector

    def __and__(self, o):
        """Use ``o`` as the selector; a filter class is instantiated first."""
        is_filter_class = isinstance(o, type) and issubclass(o, _Filter)
        self.selector = o() if is_filter_class else o
        return self

    def __call__(self, item):
        """Run the selector on the loaded page, or return ``None`` if there is no loader."""
        if item.loaders[self.name] is None:
            return None

        page = self.loaded_page(item)
        return self.select(self.selector, page.doc)

    def filter(self, *args):
        """Not usable on this filter: always raises :class:`AttributeError`."""
        raise AttributeError()

    def loaded_page(self, item):
        """Wait for the loader's result and return the page attached to it."""
        response = item.loaders[self.name].result()
        assert response.page is not None, "The loaded url %s hasn't been matched by an URL object" % response.url
        return response.page


class Base(Filter):
    """
    Evaluate a selector relative to another base element.

    >>> Base(Env('header'), CleanText('./h1'))  # doctest: +SKIP
    """

    def __init__(self, base, selector=None, default=_NO_DEFAULT):
        """
        :param base: filter giving the element to use as the new base
        :param selector: filter evaluated against that base element
        """
        super().__init__(selector, default)
        self.base = base

    def __call__(self, item):
        root = self.select(self.base, item)
        # A list result is only accepted when it holds exactly one element.
        if isinstance(root, list):
            assert len(root) == 1, "If using a list, there must be one element only"
            root = root[0]
        return self.select(self.selector, root)


class Decode(Filter):
    """
    Filter that aims to decode urlencoded strings

    >>> Decode(Env('_id'))  # doctest: +ELLIPSIS
    <woob.browser.filters.standard.Decode object at 0x...>
    >>> from .html import Link
    >>> Decode(Link('./a'))  # doctest: +ELLIPSIS
    <woob.browser.filters.standard.Decode object at 0x...>
    """

    # Fallback used when filter() is called directly, before __call__ had a
    # chance to read the encoding of the page.
    encoding = "utf-8"

    def __call__(self, item):
        """Decode the selected text using the page encoding (UTF-8 if the page has none)."""
        self.encoding = item.page.ENCODING or "utf-8"
        return self.filter(self.select(self.selector, item))

    @debug()
    def filter(self, txt):
        """Percent-decode ``txt`` with ``self.encoding``.

        :returns: the decoded text, or ``txt`` unchanged if a Unicode
                  encode/decode error occurs.
        """
        # Local import: the module top level only imports parse_qs/urlparse.
        from urllib.parse import unquote

        try:
            return unquote(txt, self.encoding)
        except (UnicodeDecodeError, UnicodeEncodeError):
            return txt


class Env(_Filter):
    """
    Filter returning a value stored in the environment of the item.

    Typical uses are reading page parameters, or values set by a parse()
    method on an ItemElement.
    """

    def __init__(self, name, default=_NO_DEFAULT):
        """
        :param name: key to look up in ``item.env``
        """
        super().__init__(default)
        self.name = name

    def __call__(self, item):
        """
        :raises: :class:`ItemNotFound` if the key is missing and no default is set
        """
        try:
            value = item.env[self.name]
        except KeyError:
            error = ItemNotFound("Environment variable %s not found" % self.name)
            return self.default_or_raise(error)
        return value


class RawText(Filter):
    """Return the raw text of an element.

    Contrary to :class:`CleanText`, whitespace is left untouched.
    """

    def __init__(self, selector=None, children=False, default=_NO_DEFAULT):
        """
        :param children: whether to include the text of child elements of the selected elements
        :type children: bool
        """

        super().__init__(selector, default=default)
        self.children = children

    @debug()
    def filter(self, el):
        """Extract the text of ``el``; tuples and lists are joined with single spaces.

        When the element has no text, ``self.default`` is returned as is
        (this method does not raise in that case).
        """
        if isinstance(el, (tuple, list)):
            return " ".join(self.filter(sub) for sub in el)

        text = el.text_content() if self.children else el.text
        return self.default if text is None else str(text)


class CleanText(Filter):
    """
    Get a cleaned text from an element.

    It first replaces all tabs and multiple spaces
    (including newlines if ``newlines`` is True)
    to one space and strips the result string.

    The result is coerced into str, and optionally normalized
    according to the ``normalize`` argument.

    Then it replaces all symbols given in the ``symbols`` argument.

    >>> CleanText().filter('coucou ') == u'coucou'
    True
    >>> CleanText().filter(u'coucou\xa0coucou') == u'coucou coucou'
    True
    >>> CleanText(newlines=True).filter(u'coucou\\r\\n coucou ') == u'coucou coucou'
    True
    >>> CleanText(newlines=False).filter(u'coucou\\r\\n coucou ') == u'coucou\\ncoucou'
    True
    """

    def __init__(
        self,
        selector=None,
        symbols="",
        replace=None,
        children=True,
        newlines=True,
        transliterate=False,
        normalize="NFC",
        **kwargs,
    ):
        """
        :param symbols: list of strings to remove from text
        :type symbols: list
        :param replace: optional pairs of text replacements to perform
                        (``None``, the default, means no replacement)
        :type replace: list[tuple[str, str]] or None
        :param children: whether to get text from children elements of the select elements
        :type children: bool
        :param newlines: if True, newlines will be converted to space too
        :type newlines: bool
        :param normalize: Unicode normalization to perform
        :type normalize: str or None
        :param transliterate: Transliterates unicode characters into ASCII characters
        :type transliterate: bool
        """

        super().__init__(selector, **kwargs)
        self.symbols = symbols
        # A fresh list per instance: a mutable default argument would be
        # shared by every filter built without an explicit ``replace``.
        self.toreplace = [] if replace is None else replace
        self.children = children
        self.newlines = newlines
        self.normalize = normalize
        self.transliterate = transliterate

    @debug()
    def filter(self, txt):
        """Clean ``txt``, then strip ``symbols`` and apply the ``replace`` pairs.

        :raises: :class:`FilterError` if ``txt`` is None and no default is set
        """
        if txt is None:
            return self.default_or_raise(FilterError("The text cannot be None"))
        elif isinstance(txt, int):
            txt = str(txt)
        elif isinstance(txt, (tuple, list)):
            # Each item is cleaned on its own first; the joined string is
            # cleaned once more below with the full set of options.
            txt = " ".join(self.clean(item, newlines=self.newlines, children=self.children) for item in txt)

        txt = self.clean(txt, self.children, self.newlines, self.normalize, self.transliterate)
        txt = self.remove(txt, self.symbols)
        txt = self.replace(txt, self.toreplace)
        return txt

    @classmethod
    def clean(cls, txt, children=True, newlines=True, normalize="NFC", transliterate=False):
        """
        Cleans the text. The children argument is ignored with Selenium.

        lxml elements are flattened to text first (all descendant text when
        ``children`` is true, direct text nodes otherwise); any other
        non-string object is expected to provide ``itertext()``.
        """
        if isinstance(txt, LXMLElement):
            if children:
                txt = list(txt.itertext())
            else:
                txt = list(txt.xpath("./text()"))
            txt = " ".join(txt)  # 'foo   bar '
        elif not isinstance(txt, str):
            txt = " ".join(txt.itertext())

        return clean_text(
            txt,
            remove_newlines=newlines,
            normalize=normalize,
            transliterate=transliterate,
        )

    @classmethod
    def remove(cls, txt, symbols):
        """Remove every occurrence of each symbol, then strip the result."""
        for symbol in symbols:
            txt = txt.replace(symbol, "")
        return txt.strip()

    @classmethod
    def replace(cls, txt, replace):
        """Apply each ``(before, after)`` replacement pair in order."""
        for before, after in replace:
            txt = txt.replace(before, after)
        return txt


class Lower(CleanText):
    """Clean the text like :class:`CleanText`, then lower-case it."""

    @debug()
    def filter(self, txt):
        return super().filter(txt).lower()


class Upper(CleanText):
    """Clean the text like :class:`CleanText`, then upper-case it."""

    @debug()
    def filter(self, txt):
        return super().filter(txt).upper()


class Title(CleanText):
    """Clean the text like :class:`CleanText`, then call ``title()`` on it."""

    @debug()
    def filter(self, txt):
        return super().filter(txt).title()


class Currency(CleanText):
    """Clean the text and pass it to ``BaseCurrency.get_currency``."""

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`FormatError` if the cleaned text is empty and no default is set
        """
        cleaned = super().filter(txt)
        if not empty(cleaned):
            return BaseCurrency.get_currency(cleaned)
        return self.default_or_raise(FormatError("Unable to parse %r" % cleaned))


class CountryCode(CleanText):
    """
    Filter turning a country name into its ISO 3166-1 alpha-2 code
    """

    @debug()
    def filter(self, txt: str) -> Any:
        """Look up the lower-cased alpha-2 code matching a country name

        :param txt: Country name
        :type txt: str
        :raises FormatError: if the Country name is not found

        >>> CountryCode().filter('france')
        'fr'
        >>> CountryCode(default= 'll').filter('Greez')
        'll'
        """
        name = super().filter(txt)
        if empty(name):
            return self.default_or_raise(FormatError("Unable to parse %r" % name))

        try:
            country = pycountry.countries.lookup(name)
        except LookupError:
            # pycountry does not know this name
            return self.default_or_raise(FormatError("Country not recognized: %r" % name))

        return country.alpha_2.lower()


class NumberFormatError(FormatError, InvalidOperation):
    """Number parsing error; it is both a :class:`FormatError` and a ``decimal.InvalidOperation``."""


class CleanDecimal(CleanText):
    """
    Get a cleaned Decimal value from an element.

    `replace_dots` is False by default. A dot is interpreted as a decimal separator.

    If `replace_dots` is set to True, we remove all the dots. The ',' is used as decimal
    separator (often useful for French values)

    If `replace_dots` is a tuple, the first element will be used as the thousands separator,
    and the second as the decimal separator.

    See https://en.wikipedia.org/wiki/Thousands_separator#Examples_of_use

    For example, for the UK style (as in 1,234,567.89):

    >>> CleanDecimal('./td[1]', replace_dots=(',', '.'))  # doctest: +SKIP
    """

    def __init__(self, selector=None, replace_dots=False, sign=None, legacy=True, default=_NO_DEFAULT):
        """
        :param replace_dots: False, True, or a ``(thousands_sep, decimal_sep)`` pair;
                             a pair is mandatory when ``legacy`` is False
        :param sign: function accepting the text as param and returning the sign,
                     or one of the strings ``'+'`` / ``'-'`` to force the sign
        :param legacy: if True, use the permissive legacy parsing; if False, use the
                       strict parsing based on the separators pair
        :raises TypeError: if ``legacy`` is False and ``replace_dots`` is not iterable
        """

        super().__init__(selector, default=default)
        self.replace_dots = replace_dots
        self.sign = sign
        self.legacy = legacy
        if not legacy:
            try:
                thousands_sep, decimal_sep = self.replace_dots
            except TypeError as exc:
                # Typically replace_dots was left to its boolean default:
                # report that instead of a bare unpacking error.
                raise TypeError(
                    "replace_dots must be a (thousands_sep, decimal_sep) pair when legacy=False, got %r"
                    % (self.replace_dots,)
                ) from exc
            # Sign, then a number made of digits and separators (or starting
            # with the decimal separator).
            self.matching = re.compile(
                r"([+-]?)\s*(\d[\d%s%s]*|%s\d+)" % tuple(map(re.escape, (thousands_sep, decimal_sep, decimal_sep)))
            )
            # Thousands separators must split the integer part in groups of 3 digits.
            self.thousand_check = re.compile(
                r"^[+-]?\d{1,3}(%s\d{3})*(%s\d*)?$" % tuple(map(re.escape, (thousands_sep, decimal_sep)))
            )
            self.is_scientific_notation = re.compile(r"([+-]?)(\d+(?:[.,]\d*)?[eE][+-]?\d+)")

    @debug()
    def filter(self, text):
        """Parse ``text`` into a :class:`decimal.Decimal`.

        :raises: :class:`FormatError` if the value is empty, or
                 :class:`NumberFormatError` if no valid number can be
                 extracted, unless a default is set
        :raises TypeError: if ``sign`` is neither a callable, ``'+'``, ``'-'`` nor None
        """
        if isinstance(text, (float, int)):
            text = str(text)

        if empty(text):
            return self.default_or_raise(FormatError("Unable to parse %r" % text))

        original_text = text = super().filter(text)

        # U+2212 MINUS SIGN is treated as a regular hyphen-minus.
        text = text.replace("\u2212", "-")

        if self.legacy:
            if self.replace_dots:
                if isinstance(self.replace_dots, tuple):
                    thousands_sep, decimal_sep = self.replace_dots
                else:
                    thousands_sep, decimal_sep = ".", ","
                text = text.replace(thousands_sep, "").replace(decimal_sep, ".")

            # Drop everything that is not a digit, a minus sign or a dot.
            text = re.sub(r"[^\d\-\.]", "", text)
        else:
            thousands_sep, decimal_sep = self.replace_dots

            matches = self.matching.findall(text)
            if not matches:
                return self.default_or_raise(NumberFormatError("There is no number to parse"))
            elif len(matches) > 1:
                # Several numbers are only accepted when they form a single
                # value written in scientific notation.
                matches = self.is_scientific_notation.findall(text)
                if not matches:
                    return self.default_or_raise(NumberFormatError("There should be exactly one number to parse"))

            text = f"{matches[0][0]}{matches[0][1].strip()}"

            if thousands_sep and thousands_sep in text and not self.thousand_check.match(text):
                return self.default_or_raise(NumberFormatError("Thousands separator is misplaced in %r" % text))

            text = text.replace(thousands_sep, "").replace(decimal_sep, ".")

        try:
            v = Decimal(text)
        except InvalidOperation as e:
            return self.default_or_raise(NumberFormatError(e))
        else:
            if self.sign is not None:
                if callable(self.sign):
                    # The callable receives the cleaned text, before any
                    # separator processing.
                    v *= self.sign(original_text)
                elif self.sign == "+":
                    return abs(v)
                elif self.sign == "-":
                    return -abs(v)
                else:
                    raise TypeError("'sign' should be a callable or a sign string")
            return v

    @classmethod
    def US(cls, *args, **kwargs):
        """Strict parser using ``,`` for thousands and ``.`` for decimals."""
        kwargs["legacy"] = False
        kwargs["replace_dots"] = (",", ".")
        return cls(*args, **kwargs)

    @classmethod
    def French(cls, *args, **kwargs):
        """Strict parser using a space for thousands and ``,`` for decimals."""
        kwargs["legacy"] = False
        kwargs["replace_dots"] = (" ", ",")
        return cls(*args, **kwargs)

    @classmethod
    def SI(cls, *args, **kwargs):
        """Strict parser using a space for thousands and ``.`` for decimals."""
        kwargs["legacy"] = False
        kwargs["replace_dots"] = (" ", ".")
        return cls(*args, **kwargs)

    @classmethod
    def Italian(cls, *args, **kwargs):
        """Strict parser using ``.`` for thousands and ``,`` for decimals."""
        kwargs["legacy"] = False
        kwargs["replace_dots"] = (".", ",")
        return cls(*args, **kwargs)


class Slugify(Filter):
    """Turn a label into a lower-case slug whose words are joined by hyphens."""

    @debug()
    def filter(self, label):
        # Anything other than ASCII letters and digits becomes a separator.
        spaced = re.sub(r"[^A-Za-z0-9]", " ", label.lower())
        # Collapse each run of separators into a single hyphen.
        return re.sub(r"\s+", "-", spaced.strip())


class Type(Filter):
    """
    Get a cleaned value of any type from an element text.
    The type_func can be any callable (class, function, etc.).
    By default an empty string will not be parsed but it can be changed
    by specifying minlen=False. Otherwise, a minimal length can be specified.

    >>> Type(CleanText('./td[1]'), type=int)  # doctest: +SKIP

    >>> Type(type=int).filter(42)
    42
    >>> Type(type=int).filter('42')
    42
    >>> Type(type=int, default='NaN').filter('')
    'NaN'
    >>> Type(type=list, minlen=False, default=list('ab')).filter('')
    []
    >>> Type(type=list, minlen=0, default=list('ab')).filter('')
    ['a', 'b']
    """

    def __init__(self, selector=None, type=None, minlen=0, default=_NO_DEFAULT):
        """
        :param type: callable used to convert the value
        :param minlen: values whose length is lower than or equal to this are
                       rejected; ``False`` disables the length check
        """
        super().__init__(selector, default=default)
        self.type_func = type
        self.minlen = minlen

    @debug()
    def filter(self, txt):
        """Convert ``txt`` with ``type_func``.

        :raises: :class:`FormatError` if the value is empty, too short, or if
                 the conversion raises :class:`ValueError`, unless a default is set
        """
        # isinstance() only accepts classes as second argument, whereas
        # type_func may be any callable: only short-circuit for real classes.
        if isinstance(self.type_func, type) and isinstance(txt, self.type_func):
            return txt
        # The value is wrapped in a 1-tuple for "%" so that a tuple value is
        # rendered with %r instead of being unpacked as format arguments.
        if empty(txt):
            return self.default_or_raise(FormatError("Unable to parse %r" % (txt,)))
        if self.minlen is not False and len(txt) <= self.minlen:
            return self.default_or_raise(FormatError("Unable to parse %r" % (txt,)))
        try:
            return self.type_func(txt)
        except ValueError as e:
            return self.default_or_raise(FormatError(f"Unable to parse {txt!r}: {e}"))


class Field(_Filter):
    """
    Reuse the value of another field of the same object.

    Example::

        obj_foo = CleanText('//h1')
        obj_bar = Field('foo')

    makes the "bar" field equal to the "foo" field.
    """

    def __init__(self, name):
        """
        :param name: field name, without its ``obj_`` prefix
        """
        super().__init__()
        self.name = name

    def __call__(self, item):
        # The field is looked up as the item's ``obj_<name>`` attribute.
        return item.use_selector(getattr(item, "obj_%s" % self.name), key=self._key)


# Adapted from the nth recipe at https://docs.python.org/2/library/itertools.html
def nth(iterable, n, default=None):
    """Return the item at position ``n`` of ``iterable``, or ``default``.

    ``n`` may be negative to count from the end; the iterable is then fully
    consumed into a list. With ``n == '*'`` the iterable itself is returned
    untouched.
    """
    if n == "*":
        return iterable

    source, skip = iterable, n
    if skip < 0:
        source = reversed(list(iterable))
        skip = -skip - 1

    for found in islice(source, skip, None):
        return found
    return default


def ordinal(n):
    """Render an ``nth()`` index as readable text for debug and error messages.

    ``'*'`` gives ``'all'``; non-negative indexes are 0-based (0 => 1st,
    1 => 2nd...); negative indexes count from the end (-1 => -1st,
    -2 => -2nd...).
    """
    if n == "*":
        return "all"

    # Only non-negative indexes are 0-based; -1 already means "first from the end".
    position = n + 1 if n >= 0 else n
    rank = abs(position)

    # 11, 12 and 13 take "th" although they end in 1, 2 and 3.
    if 11 <= rank % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(rank % 10, "th")
    return f"{position}{suffix}"


class Regexp(Filter):
    r"""
    Apply a regex.

    >>> from lxml.html import etree
    >>> doc = etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>')
    >>> Regexp(CleanText('//p'), r'Date: (\d+)/(\d+)/(\d+)', '\\3-\\2-\\1')(doc) == u'1988-08-13'
    True

    >>> (Regexp(CleanText('//body'), r'(\d+)', nth=1))(doc) == u'08'
    True
    >>> (Regexp(CleanText('//body'), r'(\d+)', nth=-1))(doc) == u'1988'
    True
    >>> (Regexp(CleanText('//body'), r'(\d+)', template='[\\1]', nth='*'))(doc) == [u'[13]', u'[08]', u'[1988]']
    True
    >>> (Regexp(CleanText('//body'), r'Date:.*'))(doc) == u'Date: 13/08/1988'
    True
    >>> (Regexp(CleanText('//body'), r'^(?!Date:).*', default=None))(doc)
    >>>
    """

    def __init__(self, selector=None, pattern=None, template=None, nth=0, flags=0, default=_NO_DEFAULT):
        """
        :param pattern: regular expression to search for (mandatory)
        :param template: match expansion template (see ``re.Match.expand``),
                         or a callable receiving the match object
        :param nth: index of the match to use; may be negative, or ``'*'``
                    to get the list of all matches
        :param flags: flags given to ``re.compile``
        :raises: :class:`FilterError` if ``pattern`` is missing
        """
        super().__init__(selector, default=default)
        if pattern is None:
            raise FilterError("Missing pattern parameter")
        self.pattern = pattern

        self._regex = re.compile(pattern, flags)
        self.template = template
        self.nth = nth

    def expand(self, m):
        """Build the result for one match object.

        Without template, this is the first group that matched, or the whole
        searched string when no group matched.
        """
        if self.template is None:
            try:
                return next(g for g in m.groups() if g is not None)
            except StopIteration:
                return m.string
        return self.template(m) if callable(self.template) else m.expand(self.template)

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`RegexpError` if `pattern` was not found
        """

        if isinstance(txt, (tuple, list)):
            # Flatten the sequence into one string: strings are taken as is,
            # other items are expected to provide itertext() (as elements do).
            parts = []
            for part in txt:
                if isinstance(part, str):
                    parts.append(part.strip())
                else:
                    parts.extend(chunk.strip() for chunk in part.itertext())
            txt = " ".join(parts)

        m = None
        try:
            m = self._regex.search(txt) if self.nth == 0 else nth(self._regex.finditer(txt), self.nth)
        except TypeError:
            msg = "%r is not a string or bytes-like object" % txt
            return self.default_or_raise(RegexpError(msg))

        if not m:
            # Keep the error message to a reasonable size.
            if len(txt) > 1024:
                txt = txt[:1021] + "..."
            msg = f"Unable to find {ordinal(self.nth)} {self.pattern} in {txt!r}"
            return self.default_or_raise(RegexpError(msg))

        # With nth='*', nth() returned the iterator over all the matches.
        if isinstance(m, Iterator):
            return list(map(self.expand, m))

        return self.expand(m)


class Map(Filter):
    """Translate the selected value into another one through a dict lookup.

    Example::

        TYPES = {
            'Concert': CATEGORIES.CONCERT,
            'Cinéma': CATEGORIES.CINE,
        }

        obj_type = Map(CleanText('./li'), TYPES)
    """

    def __init__(self, selector, map_dict, default=_NO_DEFAULT):
        """
        :param selector: filter giving the key to look up in `map_dict`
        :param map_dict: mapping from selected values to returned values
        """

        super().__init__(selector, default=default)
        self.map_dict = map_dict

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`ItemNotFound` if the key is not in the dict and no default is set
        """

        try:
            mapped = self.map_dict[txt]
        except KeyError:
            return self.default_or_raise(ItemNotFound(f"Unable to handle {txt!r} on {self.map_dict!r}"))
        return mapped


class MapIn(Filter):
    """
    Return the value of the first dict key that is contained in the selected value.
    """

    def __init__(self, selector, map_dict, default=_NO_DEFAULT):
        """
        :param selector: filter giving the value in which keys of `map_dict` are searched
        :param map_dict: mapping from searched patterns to returned values
        """
        super().__init__(selector, default=default)
        self.map_dict = map_dict

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`ItemNotFound` if no key is contained in the value and no default is set
        """
        # Nothing can be searched in a None value.
        candidates = self.map_dict if txt is not None else ()
        for needle in candidates:
            if needle in txt:
                return self.map_dict[needle]

        return self.default_or_raise(ItemNotFound(f"Unable to handle {txt!r} on {self.map_dict!r}"))


class DateTime(Filter):
    """Parse a date and time."""

    def __init__(
        self,
        selector=None,
        default=_NO_DEFAULT,
        translations=None,
        parse_func=parse_date,
        strict=True,
        tzinfo=None,
        **kwargs,
    ):
        """
        :param dayfirst: if True, the day is the first element in the string to parse
        :type dayfirst: bool
        :param parse_func: the function to use for parsing the datetime
        :param translations: replacements from site locale to English, as
                             ``(pattern, replacement)`` pairs where ``pattern``
                             provides a ``sub()`` method (e.g. a compiled regex)
        :param strict: if True, reject texts that do not contain a complete date
        :type strict: bool
        :param tzinfo: timezone to set if none was parsed; a string is
                       resolved with ``dateutil.tz.gettz``
        :type tzinfo: :class:`datetime.tzinfo` or str

        Extra keyword arguments are passed to ``parse_func``.
        """

        super().__init__(selector, default=default)
        self.kwargs = kwargs
        self.translations = translations
        self.parse_func = parse_func
        self.strict = strict
        self.tzinfo = gettz(tzinfo) if isinstance(tzinfo, str) else tzinfo

    # Two different fallback dates: in strict mode the text is parsed once with
    # each, and any missing component makes the two results differ.
    _default_date_1 = datetime.datetime(2100, 10, 10, 1, 1, 1)
    _default_date_2 = datetime.datetime(2120, 12, 12, 2, 2, 2)

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`FormatError` if the text is empty or cannot be parsed
                 and no default is set
        :raises: :class:`FilterError` in strict mode if the date is incomplete
        """
        if empty(txt) or txt == "":
            return self.default_or_raise(FormatError("Unable to parse %r" % txt))
        try:
            for search, repl in self.translations or ():
                txt = search.sub(repl, txt)

            if not self.strict:
                parsed = self.parse_func(txt, **self.kwargs)
            else:
                parsed = self.parse_func(txt, default=self._default_date_1, **self.kwargs)
                control = self.parse_func(txt, default=self._default_date_2, **self.kwargs)
                if parsed != control:
                    raise FilterError("Date is not complete")

            # Only apply the configured timezone to naive results.
            if parsed.tzinfo is None and self.tzinfo:
                parsed = parsed.replace(tzinfo=self.tzinfo)

            return parsed
        except (ValueError, TypeError) as e:
            return self.default_or_raise(FormatError(f"Unable to parse {txt!r}: {e}"))


class FromTimestamp(Filter):
    """Parse a timestamp into a datetime."""

    def __init__(self, selector, millis=False, tz=None, default=_NO_DEFAULT):
        """
        :param millis: if True, the timestamp is divided by 1000 before conversion
        :type millis: bool
        :param tz: passed as ``tz`` to ``datetime.datetime.fromtimestamp``
        """
        super().__init__(selector, default=default)
        self.millis = millis
        self.tz = tz

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`FormatError` if the value is not a number or cannot be
                 converted to a datetime, and no default is set
        """
        try:
            ts = float(txt)
        except (TypeError, ValueError) as exc:
            return self.default_or_raise(FormatError(f"Unable to parse {txt!r}: {exc}"))

        if self.millis:
            ts /= 1000

        try:
            return datetime.datetime.fromtimestamp(ts, tz=self.tz)
        except (TypeError, ValueError, OverflowError, OSError) as exc:
            # float() accepts values such as 'inf', 'nan' or 1e30 which
            # fromtimestamp() rejects with ValueError/OverflowError/OSError.
            return self.default_or_raise(FormatError(f"Unable to parse {txt!r}: {exc}"))


class Date(DateTime):
    """Parse a date."""

    def __init__(
        self, selector=None, default=_NO_DEFAULT, translations=None, parse_func=parse_date, strict=True, **kwargs
    ):
        """Accept the same arguments as :class:`DateTime` and forward them to it."""
        super().__init__(
            selector, default=default, translations=translations, parse_func=parse_func, strict=strict, **kwargs
        )

    # Both fallback dates share the same time of day, so only a missing date
    # component makes the two strict-mode parses differ.
    _default_date_1 = datetime.datetime(2100, 10, 10, 1, 1, 1)
    _default_date_2 = datetime.datetime(2120, 12, 12, 1, 1, 1)

    @debug()
    def filter(self, txt):
        """Parse like :class:`DateTime` and keep only the date part.

        Results without a ``date`` attribute (e.g. a default value) are
        returned unchanged.
        """
        parsed = super().filter(txt)
        return parsed.date() if hasattr(parsed, "date") else parsed


class DateGuesser(Filter):
    """Build a date from a (day, month) value using a date guesser object."""

    def __init__(self, selector, date_guesser, **kwargs):
        """
        :param selector: filter giving either a ``day/month`` or ``day-month``
                         string, or a sequence of two values
        :param date_guesser: object providing ``guess_date(day, month, **kwargs)``,
                             or a filter returning such an object
        :param kwargs: extra keyword arguments passed to ``guess_date``
        """
        super().__init__(selector)
        self.date_guesser = date_guesser
        self.kwargs = kwargs

    def __call__(self, item):
        """
        :raises: :class:`FormatError` if the value does not hold exactly two parts
        """
        values = self.select(self.selector, item)
        date_guesser = self.date_guesser
        # In case Env() is used to give date_guesser.
        if isinstance(date_guesser, _Filter):
            date_guesser = self.select(date_guesser, item)

        if isinstance(values, str):
            values = re.split("[/-]", values)
        if len(values) != 2:
            # Wrapped in a 1-tuple: a tuple value would otherwise be unpacked
            # as "%" arguments and raise TypeError instead of FormatError.
            raise FormatError("Unable to take (day, month) tuple from %r" % (values,))

        day, month = map(int, values)
        return date_guesser.guess_date(day, month, **self.kwargs)


class Time(Filter):
    """Parse time."""

    # Class instantiated with the parsed components.
    klass: type = datetime.time
    # Hours and minutes optionally separated by ':' or 'h', then optional
    # seconds introduced by ':' or 'm'.
    _regexp = re.compile(r"(?P<hh>\d+)[:h]?(?P<mm>\d+)([:m](?P<ss>\d+))?")
    # Maps ``klass`` keyword arguments to the regexp group names.
    kwargs = {"hour": "hh", "minute": "mm", "second": "ss"}

    def __init__(self, selector=None, default=_NO_DEFAULT):
        super().__init__(selector, default=default)

    @debug()
    def filter(self, txt):
        """
        :raises: :class:`FormatError` if no time is found in the text, or if
                 the components are rejected by ``klass`` (e.g. hour 25),
                 unless a default is set
        """
        m = self._regexp.search(txt)
        if not m:
            return self.default_or_raise(FormatError("Unable to find time in %r" % txt))

        groups = m.groupdict()
        # Groups that did not take part in the match count as 0.
        kwargs = {key: int(groups[index] or 0) for key, index in self.kwargs.items()}
        try:
            return self.klass(**kwargs)
        except (ValueError, OverflowError) as exc:
            # Out-of-range components are reported like any other parse
            # failure, so that the default is honoured.
            return self.default_or_raise(FormatError(f"Unable to parse {txt!r}: {exc}"))


class Duration(Time):
    """Parse a duration and return it as a timedelta."""

    # Optional hours, then minutes and seconds; parts are separated by ':' or ';'.
    _regexp = re.compile(r"((?P<hh>\d+)[:;])?(?P<mm>\d+)[;:](?P<ss>\d+)")
    # Maps timedelta keyword arguments to the regexp group names.
    kwargs = {"hours": "hh", "minutes": "mm", "seconds": "ss"}
    klass: type = datetime.timedelta


class MultiFilter(Filter):
    """Base class for filters combining the values of several selectors."""

    def __init__(self, *args, **kwargs):
        """
        :param args: the selectors to evaluate, stored together as the selector
        :param default: optional keyword giving the default value
        """
        super().__init__(args, kwargs.pop("default", _NO_DEFAULT))

    def __call__(self, item):
        """Evaluate every selector on ``item`` and filter the tuple of results."""
        values = tuple(self.select(selector, item) for selector in self.selector)
        return self.filter(values)

    def filter(self, values):
        """Combine the selected values; must be implemented by subclasses."""
        raise NotImplementedError()


class CombineDate(MultiFilter):
    """Merge the results of a date filter and a time filter into one datetime."""

    def __init__(self, date, time):
        """
        :param date: filter giving the date part
        :type date: filter
        :param time: filter giving the time part
        :type time: filter
        """
        super().__init__(date, time)

    @debug()
    def filter(self, values):
        day, clock = values[0], values[1]
        return datetime.datetime.combine(day, clock)


class Format(MultiFilter):
    """Render the values of several filters through a "%"-format string.

    Example::

        obj_title = Format('%s (%s)', CleanText('//h1'), CleanText('//h2'))

    builds a title from the text of all ``<h1>`` followed by the text of all
    ``<h2>`` between parentheses.
    """

    def __init__(self, fmt, *args):
        """
        :param fmt: string format suitable for "%"-formatting
        :type fmt: str
        :param args: filters whose values are inserted in `fmt`.
                     There should be as many args as there are "%" in `fmt`.
        """
        super().__init__(*args)
        self.fmt = fmt

    @debug()
    def filter(self, values):
        # ``values`` is the tuple of selected values, used as "%" arguments.
        rendered = self.fmt % values
        return rendered


class BrowserURL(MultiFilter):
    r"""Build an URL from an URL object declared on the parent Browser

    The URL object is looked up by name on the browser instance of the
    page, and built with the values of the given filters.

    .. code-block:: python

        class MyBrowser:

            mypage = URL('(?P<category>\w+)/(?P<id>\w+)')


        class OnePage(Page):
            class item(ItemElement):
                obj_myfield = BrowserURL('mypage', id=Dict('id'), category=Dict('category'))

    """

    def __init__(self, url_name, **kwargs):
        """
        :param url_name: name of the URL attribute on the browser
        :param kwargs: filters giving the value of each URL parameter
        """
        super().__init__(*kwargs.values())
        self.url_name = url_name
        self.keys = list(kwargs)

    def __call__(self, item):
        values = super().__call__(item)
        url = getattr(item.page.browser, self.url_name)
        assert isinstance(url, URL), f"{type(item.page.browser).__name__}.{self.url_name} must be an URL object"
        # Values come back in the order of the keyword arguments.
        params = dict(zip(self.keys, values))
        return url.build(**params)

    @debug()
    def filter(self, values):
        """Return the selected values unchanged."""
        return values


class Join(Filter):
    """
    Join the results of a selector into a single string.
    >>> Join(' - ', '//div/p')  # doctest: +SKIP

    >>> Join(pattern=', ').filter([u"Oui", u"bonjour", ""]) == u"Oui, bonjour"
    True
    >>> Join(pattern='-').filter([u"Au", u"revoir", ""]) == u"Au-revoir"
    True
    >>> Join(pattern='-').filter([]) == u""
    True
    >>> Join(pattern='-', default=u'empty').filter([]) == u'empty'
    True
    """

    def __init__(
        self,
        pattern,
        selector=None,
        textCleaner=CleanText,
        newline=False,
        addBefore="",
        addAfter="",
        default=_NO_DEFAULT,
    ):
        """
        :param pattern: separator inserted between the items
        :param textCleaner: object whose ``clean()`` is applied to each item
        :param newline: if True, ``\\r\\n`` is appended to each item
        :param addBefore: text prepended to the joined result
        :param addAfter: text appended to the joined result
        """
        super().__init__(selector, default=default)
        self.pattern = pattern
        self.textCleaner = textCleaner
        self.newline = newline
        self.addBefore = addBefore
        self.addAfter = addAfter

    @debug()
    def filter(self, el):
        """Clean each item, drop the empty ones and join the rest.

        An empty result is replaced by the default when one is set.
        """
        cleaned = (self.textCleaner.clean(e) for e in el)
        items = [item for item in cleaned if item]

        if self.newline:
            items = ["%s\r\n" % item for item in items]

        prefix = self.addBefore if self.addBefore else ""
        suffix = self.addAfter if self.addAfter else ""
        result = f"{prefix}{self.pattern.join(items)}{suffix}"

        if result or self.default is _NO_DEFAULT:
            return result
        return self.default


class MultiJoin(MultiFilter):
    """
    Join the values of several filters into a single string.
    >>> MultiJoin(Field('field1'), Field('field2'))  # doctest: +SKIP

    >>> MultiJoin(pattern=u', ').filter([u"Oui", u"bonjour", ""]) == u"Oui, bonjour"
    True
    >>> MultiJoin(pattern=u'-').filter([u"Au", u"revoir", ""]) == u"Au-revoir"
    True
    >>> MultiJoin(pattern=u'-').filter([]) == u""
    True
    >>> MultiJoin(pattern=u'-', default=u'empty').filter([]) == u'empty'
    True
    >>> MultiJoin(pattern=u'-').filter([1, 2, 3]) == u'1-2-3'
    True
    """

    def __init__(self, *args, **kwargs):
        """
        :param pattern: optional keyword giving the separator (``', '`` by default)
        """
        self.pattern = kwargs.pop("pattern", ", ")
        super().__init__(*args, **kwargs)

    @debug()
    def filter(self, values):
        """Join the string form of every truthy value.

        When nothing is left to join, the default is returned if one is set.
        """
        kept = [str(v) for v in values if v]
        if kept or self.default is _NO_DEFAULT:
            return self.pattern.join(kept)
        return self.default


class Eval(MultiFilter):
    """
    Call a function with the values of the given filters as arguments.

    >>> F = Field; Eval(lambda a, b, c: a * b + c, F('foo'), F('bar'), F('baz')) # doctest: +SKIP
    >>> Eval(lambda x, y: x * y + 1).filter([3, 7])
    22

    Example::

        obj_ratio = Eval(lambda x: x / 100, Env('percentage'))
    """

    def __init__(self, func, *args):
        """
        :param func: function called with the value of each filter as
                     positional arguments; it should accept as many
                     args as there are filters passed to Eval.
        """
        super().__init__(*args)
        self.func = func

    @debug()
    def filter(self, values):
        result = self.func(*values)
        return result


class QueryValue(Filter):
    """
    Return the value of one query string parameter of an URL.

    >>> from lxml.html import etree
    >>> from .html import Link
    >>> f = QueryValue(Link('//a'), 'id')
    >>> f(etree.fromstring('<html><body><a href="https://example.org/view?id=1234"></a></body></html>')) == u'1234'
    True
    """

    def __init__(self, selector, key, default=_NO_DEFAULT):
        """
        :param key: name of the query string parameter to extract
        """
        super().__init__(selector, default=default)
        self.querykey = key

    @debug()
    def filter(self, url):
        """
        :raises: :class:`ItemNotFound` if the key is missing and no default is set
        :raises: :class:`FilterError` if the key has more than one value
        """
        found = parse_qs(urlparse(url).query).get(self.querykey)
        if not found:
            return self.default_or_raise(ItemNotFound("Key %s not found" % self.querykey))
        if len(found) > 1:
            raise FilterError("More than one value for key %s" % self.querykey)
        return found[0]


class Coalesce(MultiFilter):
    """
    Return the first usable value: a truthy one, or a number that is not
    an empty value (so that a numeric zero is accepted).
    Falls back to the default when no value qualifies.
    """

    @debug()
    def filter(self, values):
        """
        :raises: :class:`FilterError` if no value qualifies and no default is set
        """

        def usable(candidate):
            # Accept '0.00' as valid value for numeric elements
            return bool(candidate) or (isinstance(candidate, Number) and not empty(candidate))

        for value in values:
            if usable(value):
                return value
        return self.default_or_raise(FilterError("All falsy and no default."))
Source code for woob.browser.filters.standard

Navigation

External links

Related Topics