# Source code for woob.browser.filters.file
# Copyright(C) 2023 Powens
#
# This file is part of woob.
#
# woob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# woob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with woob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

import mimetypes
from os.path import basename, splitext
from typing import Any

from woob.browser.filters.standard import CleanText, FormatError
from woob.capabilities.base import empty
from woob.tools.misc import NO_DEFAULT

from .base import debug

__all__ = ['MimeType', 'FileExtension']
class MimeType(CleanText):
    """
    Filter guessing the MIME type (Multipurpose Internet Mail Extensions)
    of a file from a string holding a file path, a file name or a bare extension.

    :param default: Value returned instead of raising when no MIME type can be guessed.
    :type default: Any, optional
    """

    @debug()
    def filter(self, txt: str) -> Any:
        """
        Guess the MIME type matching a file name or path.

        :param txt: File name, file path or bare extension to inspect.
        :type txt: str
        :raises FormatError: If no MIME type is recognized and no default was given.

        >>> MimeType().filter('foo.pdf')
        'application/pdf'
        >>> MimeType().filter('path/foo/invoices.tar.gz')
        'application/x-tar'
        >>> MimeType(default='NAN').filter('foo.no')
        'NAN'
        """
        txt = super().filter(txt)
        if empty(txt):
            return self.default_or_raise(FormatError(f'Unable to parse {txt}'))

        # 'mimetypes.guess_type()' needs something shaped like
        # 'file_name.extension'. A bare extension ('pdf') or a dot-leading
        # one ('.pdf') is therefore completed with a placeholder file name.
        # The two cases are mutually exclusive: a string without any dot
        # cannot start with one.
        if '.' not in txt:
            # pdf
            txt = f'dummy_filename.{txt}'
        elif txt.startswith('.'):
            # .pdf
            txt = f'dummy_filename{txt}'

        # guess_type() returns a (type, encoding) pair; only the type matters here.
        guessed = mimetypes.guess_type(txt)[0]
        if guessed:
            return guessed

        return self.default_or_raise(
            FormatError(f'MIME type not recognized for file: {txt}')
        )


class FileExtension(CleanText):
    """
    A filter to extract the file extension from a given string representing
    a file name or a file path.

    Only the last path component is inspected, so dots appearing in directory
    names never end up in the returned extension.

    :param default: The default extension to be returned when the file extension is not recognized.
    :type default: Any, optional
    :param validate_mime: Flag to indicate whether to validate the MIME type of the returned extension.
    :type validate_mime: bool, optional
    """

    def __init__(self, selector=None, validate_mime=False, default=NO_DEFAULT):
        super().__init__(selector, default=default)
        self.validate_mime = validate_mime

    @debug()
    def filter(self, txt: str) -> Any:
        """
        Get the file extension from a file name or path.

        When the file name holds more than one dot, the last two dot-separated
        parts are returned together so that compound extensions such as
        ``tar.gz`` are preserved.

        :param txt: The file name or path for which to extract the file extension.
        :type txt: str
        :raises FormatError: If the file extension is not recognized.

        >>> FileExtension().filter('file.docx')
        'docx'
        >>> FileExtension().filter('path/to/file.tar.gz')
        'tar.gz'
        >>> FileExtension(default='NAN').filter('file_without_extension')
        'NAN'
        >>> FileExtension().filter('/home/user/Documents/report.pdf')
        'pdf'
        >>> FileExtension().filter('/home/user.name/Documents/report.pdf')
        'pdf'
        >>> FileExtension(default='UNKNOWN').filter('spreadsheet')
        'UNKNOWN'
        >>> FileExtension(default='UNKNOWN').filter('trailing_dot.')
        'UNKNOWN'
        >>> FileExtension(default='UNKNOWN', validate_mime=True).filter('path/to/file.dfs')
        'UNKNOWN'
        >>> FileExtension(default='UNKNOWN', validate_mime=True).filter('file.jpg')
        'jpg'
        >>> FileExtension(default='UNKNOWN', validate_mime=True).filter('backup.tar.gz')
        'tar.gz'
        """
        txt = super().filter(txt)
        if empty(txt):
            return self.default_or_raise(FormatError(f'Unable to parse {txt}'))

        # Work on the last path component only: splitting the whole path on
        # '.' would let a dotted directory name ('/home/user.name/report.pdf')
        # leak path fragments into the extension.
        file_name = basename(txt)

        parts = file_name.split('.')
        if len(parts) > 2:
            # Several dots: keep the last two parts to preserve compound
            # extensions such as 'tar.gz'.
            extension = '.'.join(parts[-2:])
        else:
            _, extension = splitext(file_name)

        # Strip the dots *before* testing for emptiness, otherwise a name
        # ending with a lone dot ('file.') would yield '' as its extension.
        extension = extension.strip('.')
        if not extension:
            return self.default_or_raise(FormatError(f'Extension not recognized: {txt}'))

        if self.validate_mime:
            try:
                # The leading dot makes MimeType prepend its placeholder file
                # name. Passed bare, 'tar.gz' would be read as a file named
                # 'tar' with a gzip encoding and no recognizable type.
                MimeType().filter(f'.{extension}')
            except FormatError:
                return self.default_or_raise(FormatError(
                    f'MIME type not recognized for the extension {extension}')
                )

        return extension