Source code for woob.browser.filters.html

# Copyright(C) 2014 Romain Bignon
#
# This file is part of woob.
#
# woob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# woob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with woob. If not, see <http://www.gnu.org/licenses/>.

import datetime
from decimal import Decimal
from html import unescape
from urllib.parse import urljoin

import lxml.html as html

from woob.tools.html import html2text

from .base import _NO_DEFAULT, Filter, FilterError, ItemNotFound, _Filter, _Selector, debug
from .standard import CleanText


__all__ = [
    "CSS",
    "XPath",
    "XPathNotFound",
    "AttributeNotFound",
    "Attr",
    "Link",
    "AbsoluteLink",
    "CleanHTML",
    "FormValue",
    "HasElement",
    "TableCell",
    "ColumnNotFound",
    "ReplaceEntities",
]


[docs]class XPathNotFound(ItemNotFound):
    pass


[docs]class AttributeNotFound(ItemNotFound):
    pass


[docs]class ColumnNotFound(FilterError):
    pass


[docs]class CSS(_Selector):
    """Select HTML elements with a CSS selector

    For example::

        obj_foo = CleanText(CSS('div.main'))

    will take the text of all ``<div>`` having CSS class "main".
    """

[docs]    def select(self, selector, item):
        ret = item.cssselect(selector)
        if isinstance(ret, list):
            for el in ret:
                if isinstance(el, html.HtmlElement):
                    self.highlight_el(el, item)

        return ret


[docs]class XPath(_Selector):
    """Select HTML elements with a XPath selector"""

    pass


[docs]class Attr(Filter):
    """Get the text value of an HTML attribute.

    Get value from attribute `attr` of HTML element matched by `selector`.

    For example::

        obj_foo = Attr('//img[@id="thumbnail"]', 'src')

    will take the "src" attribute of ``<img>`` whose "id" is "thumbnail".
    """

    def __init__(self, selector, attr, default=_NO_DEFAULT):
        """
        :param selector: selector targeting the element
        :param attr: name of the attribute to take
        """

        super().__init__(selector, default=default)
        self.attr = attr

[docs]    @debug()
    def filter(self, el):
        """
        :raises: :class:`XPathNotFound` if no element is found
        :raises: :class:`AttributeNotFound` if the element doesn't have the requested attribute
        """

        try:
            return "%s" % el[0].attrib[self.attr]
        except IndexError:
            return self.default_or_raise(XPathNotFound("Unable to find element %s" % self.selector))
        except KeyError:
            return self.default_or_raise(AttributeNotFound(f"Element {el[0]} does not have attribute {self.attr}"))


[docs]class Link(Attr):
    """
    Get the link uri of an element.

    If the ``<a>`` tag is not found, an exception `IndexError` is raised.
    """

    def __init__(self, selector=None, default=_NO_DEFAULT):
        super().__init__(selector, "href", default=default)


[docs]class AbsoluteLink(Link):
    """Get the absolute link URI of an element."""

    def __call__(self, item):
        ret = super().__call__(item)
        if ret:
            ret = urljoin(item.page.url, ret)
        return ret


[docs]class CleanHTML(Filter):
    """Convert HTML to text (Markdown) using html2text.

    .. seealso:: `html2text site <https://pypi.python.org/pypi/html2text>`_
    """

    def __init__(self, selector=None, options=None, default=_NO_DEFAULT):
        """
        :param options: options suitable for html2text
        :type options: dict
        """

        super().__init__(selector=selector, default=default)
        self.options = options

[docs]    @debug()
    def filter(self, txt):
        if isinstance(txt, (tuple, list)):
            return " ".join([self.clean(item, self.options) for item in txt])
        return self.clean(txt, self.options)

[docs]    @classmethod
    def clean(cls, txt, options=None):
        if not isinstance(txt, str):
            txt = html.tostring(txt, encoding="unicode")
        options = options or {}
        return html2text(txt, **options)


class UnrecognizedElement(Exception):
    pass


[docs]class FormValue(Filter):
    """
    Extract a Python value from a form element.

    Checkboxes and radio return booleans, while the rest
    return text. For ``<select>`` tags, returns the user-visible text.
    """

[docs]    @debug()
    def filter(self, el):
        try:
            el = el[0]
        except IndexError:
            return self.default_or_raise(XPathNotFound("Unable to find element %s" % self.selector))
        if el.tag == "input":
            # checkboxes or radios
            if el.attrib.get("type") in ("radio", "checkbox"):
                return "checked" in el.attrib
            # regular text input
            elif el.attrib.get("type", "") in (
                "",
                "text",
                "email",
                "search",
                "tel",
                "url",
                "password",
                "hidden",
                "color",
            ):
                try:
                    return str(el.attrib["value"])
                except KeyError:
                    return self.default_or_raise(AttributeNotFound("Element %s does not have attribute value" % el))
            # numeric input
            elif el.attrib.get("type", "") in ("number", "range"):
                try:
                    if "." in el.attrib.get("step", ""):

                        return Decimal(el.attrib["value"])
                    else:
                        return int(el.attrib["value"])
                except KeyError:
                    return self.default_or_raise(AttributeNotFound("Element %s does not have attribute value" % el))
            # datetime input
            try:
                if el.attrib.get("type", "") == "date":
                    return datetime.datetime.strptime(el.attrib["value"], "%Y-%m-%d").date()
                elif el.attrib.get("type", "") == "time":
                    return datetime.datetime.strptime(el.attrib["value"], "%H:%M").time()
                elif el.attrib.get("type", "") == "datetime-local":
                    return datetime.datetime.strptime(el.attrib["value"], "%Y-%m-%dT%H:%M")
            except KeyError:
                return self.default_or_raise(AttributeNotFound("Element %s does not have attribute value" % el))
            else:
                raise UnrecognizedElement("Element %s is not recognized" % el)
        elif el.tag == "textarea":
            return str(el.text)
        elif el.tag == "select":
            options = el.xpath(".//option[@selected]")
            # default is the first one
            if len(options) == 0:
                options = el.xpath(".//option[1]")
            return "\n".join(str(o.text) for o in options)
        else:
            raise UnrecognizedElement("Element %s is not recognized" % el)


[docs]class HasElement(Filter):
    """
    Returns `yesvalue` if the `selector` finds elements, `novalue` otherwise.
    """

    def __init__(self, selector, yesvalue=True, novalue=False):
        super().__init__(selector, default=novalue)
        self.yesvalue = yesvalue

[docs]    @debug()
    def filter(self, value):
        if value:
            return self.yesvalue
        return self.default_or_raise(FilterError("No default value"))


[docs]class ReplaceEntities(CleanText):
    """
    Filter to replace HTML entities like "&eacute;" or "&#x42;" with their unicode counterpart.
    """

[docs]    def filter(self, data):
        txt = super().filter(data)
        return unescape(txt)


[docs]class TableCell(_Filter):
    """
    Used with TableElement, gets the cell element from its name.

    For example:

    >>> from woob.capabilities.bank import Transaction
    >>> from woob.browser.elements import TableElement, ItemElement
    >>> class table(TableElement):
    ...     head_xpath = '//table/thead/th'
    ...     item_xpath = '//table/tbody/tr'
    ...     col_date =    u'Date'
    ...     col_label =   [u'Name', u'Label']
    ...     class item(ItemElement):
    ...         klass = Transaction
    ...         obj_date = Date(TableCell('date'))
    ...         obj_label = CleanText(TableCell('label'))
    ...

    TableCell handles table tags that have
    a "colspan" attribute that modify the width of the column:
    for example <td colspan="2"> will occupy two columns instead of one,
    creating a column shift for all the next columns that must be taken
    in consideration when trying to match columns values with column heads.
    """

    def __init__(self, *names, **kwargs):
        support_th = kwargs.pop("support_th", False)
        kwargs.pop("colspan", True)
        super().__init__(**kwargs)
        self.names = names

        if support_th:
            self.td = "(./th | ./td)[%s]"
        else:
            self.td = "./td[%s]"

    def __call__(self, item):
        # New behavior, handling colspans > 1
        for name in self.names:
            col_idx = item.parent.get_colnum(name)
            if col_idx is not None:
                current_col = 0
                for td_idx in range(col_idx + 1):
                    ret = item.xpath(self.td % (td_idx + 1))
                    if col_idx <= current_col:
                        for el in ret:
                            self.highlight_el(el, item)
                        return ret

                    if not ret:
                        # There might no be no TD at all
                        # ColumnNotFound seems for case when corresponding header is not found
                        # Thus for compat return empty
                        return []

                    current_col += int(ret[0].attrib.get("colspan", 1))

        return self.default_or_raise(ColumnNotFound("Unable to find column %s" % " or ".join(self.names)))
Source code for woob.browser.filters.html

Navigation

External links

Related Topics