# Copyright(C) 2014 Romain Bignon
#
# This file is part of woob.
#
# woob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# woob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with woob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from decimal import Decimal
from html import unescape
from urllib.parse import urljoin
import lxml.html as html
from woob.tools.html import html2text
from .base import (
_NO_DEFAULT, Filter, FilterError, _Selector, debug, ItemNotFound,
_Filter,
)
from .standard import CleanText
# Names exported by ``from <this module> import *``.
__all__ = [
    'CSS',
    'XPath',
    'XPathNotFound',
    'AttributeNotFound',
    'Attr',
    'Link',
    'AbsoluteLink',
    'CleanHTML',
    'FormValue',
    'HasElement',
    'TableCell',
    'ColumnNotFound',
    'ReplaceEntities',
]
class XPathNotFound(ItemNotFound):
    """Error reported when a selector matched no element.

    Used by :class:`Attr` when the selection it receives is empty.
    """
class AttributeNotFound(ItemNotFound):
    """Error reported when an element lacks the requested attribute.

    Used by :class:`Attr` when the first matched element has no attribute
    with the requested name.
    """
class ColumnNotFound(FilterError):
    """Error reported when a table column cannot be located.

    Used by :class:`TableCell` when none of the requested column names
    resolves to a column number.
    """
class CSS(_Selector):
    """Select HTML elements with a CSS selector.

    For example::

        obj_foo = CleanText(CSS('div.main'))

    will take the text of all ``<div>`` having CSS class "main".
    """

    def select(self, selector, item):
        """Run the CSS ``selector`` against ``item``.

        :param selector: CSS selector string
        :param item: object exposing a ``cssselect()`` method
        :return: whatever ``item.cssselect(selector)`` returned, unchanged
        """
        ret = item.cssselect(selector)
        # Only a list result is walked; every HTML element in it is passed
        # to highlight_el() before the result is handed back.
        if isinstance(ret, list):
            for el in ret:
                if isinstance(el, html.HtmlElement):
                    self.highlight_el(el, item)

        return ret
class XPath(_Selector):
    """Select HTML elements with an XPath selector.

    Adds nothing to :class:`_Selector`; it exists as the explicitly named
    counterpart of :class:`CSS`.
    """
class Attr(Filter):
    """Get the text value of an HTML attribute.

    Get value from attribute `attr` of HTML element matched by `selector`.

    For example::

        obj_foo = Attr('//img[@id="thumbnail"]', 'src')

    will take the "src" attribute of ``<img>`` whose "id" is "thumbnail".
    """

    def __init__(self, selector, attr, default=_NO_DEFAULT):
        """
        :param selector: selector targeting the element
        :param attr: name of the attribute to take
        :param default: forwarded to :class:`Filter` as its ``default``
        """
        super(Attr, self).__init__(selector, default=default)
        self.attr = attr

    @debug()
    def filter(self, el):
        """Return attribute ``self.attr`` of the first element of ``el`` as text.

        :param el: indexable selection of elements; only ``el[0]`` is read
        :raises: :class:`XPathNotFound` if no element is found
        :raises: :class:`AttributeNotFound` if the element doesn't have the requested attribute

        Both errors are routed through ``default_or_raise``.
        """
        try:
            return '%s' % el[0].attrib[self.attr]
        except IndexError:
            # Empty selection: there is no el[0].
            return self.default_or_raise(XPathNotFound('Unable to find element %s' % self.selector))
        except KeyError:
            # el[0] exists but its attribute mapping has no such key.
            return self.default_or_raise(AttributeNotFound('Element %s does not have attribute %s' % (el[0], self.attr)))
class Link(Attr):
    """Get the link URI (``href`` attribute) of an element.

    This is :class:`Attr` with the attribute name fixed to ``'href'``.
    The inherited ``filter`` reports an empty selection with
    :class:`XPathNotFound` and a missing ``href`` with
    :class:`AttributeNotFound`; the underlying ``IndexError`` and
    ``KeyError`` are caught there and do not reach the caller.
    """

    def __init__(self, selector=None, default=_NO_DEFAULT):
        """
        :param selector: selector targeting the element carrying ``href``
        :param default: forwarded to :class:`Attr` as its ``default``
        """
        super(Link, self).__init__(selector, 'href', default=default)
class AbsoluteLink(Link):
    """Get the absolute link URI of an element.

    The value produced by :class:`Link` is resolved against
    ``item.page.url`` with :func:`urllib.parse.urljoin`.
    """

    def __call__(self, item):
        """Return the link found for ``item``, joined to the page URL.

        :param item: object with a ``page.url`` attribute used as the base URL
        :return: the joined URL, or the falsy value returned by
                 :class:`Link` untouched
        """
        ret = super(AbsoluteLink, self).__call__(item)
        # A falsy result (for example an empty default) is returned as is.
        if ret:
            ret = urljoin(item.page.url, ret)
        return ret
class CleanHTML(Filter):
    """Convert HTML to text (Markdown) using html2text.

    .. seealso:: `html2text site <https://pypi.python.org/pypi/html2text>`_
    """

    def __init__(self, selector=None, options=None, default=_NO_DEFAULT):
        """
        :param selector: forwarded to :class:`Filter`
        :param options: options suitable for html2text
        :type options: dict
        :param default: forwarded to :class:`Filter`
        """
        super(CleanHTML, self).__init__(selector=selector, default=default)
        self.options = options

    @debug()
    def filter(self, txt):
        """Convert ``txt`` to text.

        :param txt: one item, or a tuple/list of items; each item is either
                    a string or an element accepted by :meth:`clean`
        :return: the converted text; for a tuple/list, the per-item results
                 joined with a single space
        """
        if isinstance(txt, (tuple, list)):
            return ' '.join([self.clean(item, self.options) for item in txt])
        return self.clean(txt, self.options)

    @classmethod
    def clean(cls, txt, options=None):
        """Convert a single item with ``html2text``.

        :param txt: HTML as a string, or an object serializable by
                    ``lxml.html.tostring``
        :param options: keyword arguments passed to ``html2text``; ``None``
                        or an empty dict means no extra arguments
        :return: the result of ``html2text``
        """
        # Anything that is not already a string is serialized to markup first.
        if not isinstance(txt, str):
            txt = html.tostring(txt, encoding="unicode")
        options = options or {}
        return html2text(txt, **options)
class UnrecognizedElement(Exception):
    """Plain :class:`Exception` subclass with no extra behavior.

    NOTE(review): nothing in the visible part of this module raises it;
    presumably it signals an element a filter cannot interpret -- confirm
    against its users.
    """
class HasElement(Filter):
    """
    Returns `yesvalue` if the `selector` finds elements, `novalue` otherwise.
    """

    def __init__(self, selector, yesvalue=True, novalue=False):
        """
        :param selector: selector whose result is tested for truthiness
        :param yesvalue: value returned when the selection is truthy
        :param novalue: passed to :class:`Filter` as its ``default``
        """
        super(HasElement, self).__init__(selector, default=novalue)
        self.yesvalue = yesvalue

    @debug()
    def filter(self, value):
        """Map the selection ``value`` to ``yesvalue`` or the default.

        :param value: result of the selector; only its truthiness is used
        :return: ``self.yesvalue`` when ``value`` is truthy, otherwise the
                 result of ``default_or_raise``
        """
        if value:
            return self.yesvalue
        # `novalue` was registered as the default in __init__; the
        # FilterError is only what default_or_raise is given as a fallback.
        return self.default_or_raise(FilterError('No default value'))
class ReplaceEntities(CleanText):
    """
    Filter to replace HTML entities like "&eacute;" or "&#x42;" with their
    unicode counterpart ("é" and "B" respectively).
    """

    def filter(self, data):
        """Clean ``data`` with :class:`CleanText`, then decode HTML entities.

        :param data: value accepted by ``CleanText.filter``
        :return: the cleaned text passed through :func:`html.unescape`
        """
        txt = super(ReplaceEntities, self).filter(data)
        return unescape(txt)
class TableCell(_Filter):
    """
    Used with TableElement, gets the cell element from its name.

    For example:

    >>> from woob.capabilities.bank import Transaction
    >>> from woob.browser.elements import TableElement, ItemElement
    >>> class table(TableElement):
    ...     head_xpath = '//table/thead/th'
    ...     item_xpath = '//table/tbody/tr'
    ...     col_date = u'Date'
    ...     col_label = [u'Name', u'Label']
    ...     class item(ItemElement):
    ...         klass = Transaction
    ...         obj_date = Date(TableCell('date'))
    ...         obj_label = CleanText(TableCell('label'))
    ...

    TableCell handles table tags that have
    a "colspan" attribute that modify the width of the column:
    for example <td colspan="2"> will occupy two columns instead of one,
    creating a column shift for all the next columns that must be taken
    in consideration when trying to match columns values with column heads.
    """

    def __init__(self, *names, **kwargs):
        """
        :param names: candidate column names, tried in order
        :param support_th: when true, ``<th>`` cells are matched as well as
                           ``<td>`` cells (default: False)
        :param colspan: accepted and discarded; the colspan-aware lookup in
                        ``__call__`` is always used
        :param kwargs: remaining keyword arguments go to :class:`_Filter`
        """
        support_th = kwargs.pop('support_th', False)
        # Removed from kwargs so it is not forwarded to _Filter; its value
        # is not used.
        kwargs.pop('colspan', True)
        super(TableCell, self).__init__(**kwargs)
        self.names = names

        # XPath template for the n-th cell of a row (1-based position).
        if support_th:
            self.td = '(./th | ./td)[%s]'
        else:
            self.td = './td[%s]'

    def __call__(self, item):
        """Return the cell(s) of ``item`` for the first resolvable name.

        :param item: row object providing ``xpath()`` and
                     ``parent.get_colnum(name)``
        :return: the XPath result for the matching cell, or ``[]`` when the
                 row runs out of cells before the column is reached
        :raises: :class:`ColumnNotFound` (through ``default_or_raise``) when
                 no name yields a cell
        """
        # New behavior, handling colspans > 1
        for name in self.names:
            col_idx = item.parent.get_colnum(name)
            if col_idx is not None:
                # Column position covered so far, counting each visited
                # cell's colspan.
                current_col = 0
                for td_idx in range(col_idx + 1):
                    ret = item.xpath(self.td % (td_idx + 1))
                    if col_idx <= current_col:
                        for el in ret:
                            self.highlight_el(el, item)
                        return ret

                    if not ret:
                        # There might be no TD at all
                        # ColumnNotFound seems for case when corresponding header is not found
                        # Thus for compat return empty
                        return []

                    current_col += int(ret[0].attrib.get('colspan', 1))

        return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))