"""
Module
------
url_interface.py
Description
-----------
This module contains functions to retrieve and parse the contents of
Uniform Resource Locator (URL) paths.
Functions
---------
get_contents(url, fail_nonread=False, fail_schema=False,
timeout=10)
This function attempts to collect the contents of a URL path
`url` specified upon entry.
get_weblist(url, ext=None, include_dirname=False)
This function builds a list of files beneath the specified URL
file path.
read_webfile(url, ignore_missing=False, split=None, return_string=False)
This function collects the contents of a specified URL path
and returns a Python list containing the respective contents.
Requirements
------------
- bs4; https://www.crummy.com/software/BeautifulSoup/
- urllib; https://github.com/python/cpython/tree/3.10/Lib/urllib/
Author(s)
---------
Henry R. Winterbottom; 02 December 2022
History
-------
2022-12-02: Henry Winterbottom -- Initial implementation.
"""
# ----
# pylint: disable=broad-except
# ----
import os
import urllib.request
from typing import List, Union
import requests
from bs4 import BeautifulSoup
from requests.exceptions import MissingSchema
from utils.exceptions_interface import URLInterfaceError
from utils.logger_interface import Logger
# ----
# Define the public names exported by this module.
__all__ = ["get_contents", "get_weblist", "read_webfile"]
# ----
# Module-level logger used by the functions below; it is instantiated
# with this module's name as the caller name.
logger = Logger(caller_name=__name__)
# ----
def get_contents(
    url: str, fail_nonread: bool = False, fail_schema: bool = False, timeout: int = 10
) -> Union[str, None]:
    """
    Description
    -----------

    This function attempts to collect the contents of a URL path `url`
    specified upon entry.

    The URL path is first probed with a streamed `requests.get` call
    and is considered readable only if the response headers contain
    `Content-Length`; the contents are then read with
    `urllib.request.urlopen` and decoded as UTF-8.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the URL path contents to be
        collected.

    Keywords
    --------

    fail_nonread: ``bool``, optional

        A Python boolean valued variable specifying whether to fail
        when a URL path is non-readable and/or does not contain
        readable contents.

    fail_schema: ``bool``, optional

        A Python boolean valued variable specifying whether to fail if
        a MissingSchema exception is raised by the requests package.

    timeout: ``int``, optional

        A Python integer value specifying the timeout, in seconds,
        applied to both the header probe and the subsequent read of
        the URL path contents.

    Returns
    -------

    contents: ``Union[str, None]``

        A Python string containing the contents of the URL path `url`
        specified upon entry; if the contents are unable to be
        collected, and the keyword parameter arguments are specified
        accordingly, NoneType is returned.

    Raises
    ------

    URLInterfaceError:

        - raised if the URL path is non-readable and `fail_nonread` is
          `True` upon entry.

        - raised if the schema for the URL path could not be
          determined and `fail_schema` is `True` upon entry.

    Notes
    -----

    Only `MissingSchema` is handled here; any other exception raised
    by the requests or urllib packages (including timeouts) propagates
    to the caller.

    """

    # Probe the URL path headers. The response is streamed so that the
    # body is not downloaded here; the context manager guarantees that
    # the underlying connection is released once the headers have been
    # inspected.
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            readable = "Content-Length" in response.headers
    except MissingSchema as exc:
        if fail_schema:
            msg = f"The schema for URL path {url} could not be determined. Aborting!!!"
            raise URLInterfaceError(msg=msg) from exc
        msg = (
            f"The schema for URL path {url} could not be determined; returning "
            "NoneType."
        )
        logger.warn(msg=msg)
        return None

    # A response without a `Content-Length` header is treated as a
    # non-readable URL path; proceed accordingly.
    if not readable:
        if fail_nonread:
            msg = f"The URL path {url} is a non-readable path. Aborting!!!"
            raise URLInterfaceError(msg=msg)
        msg = f"The URL path {url} is a non-readable path; returning NoneType."
        logger.warn(msg=msg)
        return None

    # Collect the contents of the URL path; the caller's timeout is
    # honored here as well so that the read cannot block indefinitely.
    msg = f"Collecting contents from URL {url}."
    logger.info(msg=msg)
    url_req = urllib.request.Request(url)
    with urllib.request.urlopen(url_req, timeout=timeout) as url_resp:
        contents = url_resp.read().decode("utf-8")

    return contents
# ----
def get_weblist(
    url: str, ext: Union[str, None] = None, include_dirname: bool = False
) -> List:
    """
    Description
    -----------

    This function builds a list of files beneath the specified URL
    file path.

    The URL path contents are parsed as HTML and the `href` attribute
    of every anchor (`<a>`) element is collected; anchors without an
    `href` attribute are skipped.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the path to the internet
        (world-wide web; WWW) file to be retrieved.

    Keywords
    --------

    ext: ``str``, optional

        A Python string specifying the web filename extension; only
        `href` values ending with this string are returned; if
        NoneType on entry the value defaults to an empty string
        (i.e., all `href` values are returned).

    include_dirname: ``bool``, optional

        A Python boolean valued variable specifying whether to prepend
        the directory name of the URL path `url` to the retrieved file
        names; if `False` upon entry, the retrieved file names are the
        `href` values exactly as they appear in the URL path contents.

    Returns
    -------

    weblist: ``List``

        A Python list containing the files beneath the specified URL.

    Raises
    ------

    URLInterfaceError:

        - raised if an Exception is encountered while attempting to
          retrieve or parse the URL path contents; the respective
          error message accompanies the message string passed to the
          URLInterfaceError class.

        - raised if an Exception is encountered while compiling the
          list of files beneath the URL path.

    """

    # Collect the contents of the URL file path into memory and parse
    # the contents of the URL file path; proceed accordingly.
    try:
        request = urllib.request.Request(url=url)
        with urllib.request.urlopen(url=request) as response:
            url_contents = response.read()
        soup = BeautifulSoup(url_contents, "html.parser")
    except Exception as errmsg:
        msg = f"Retrieving the URL path {url} failed with error {errmsg}. Aborting!!!"
        raise URLInterfaceError(msg=msg) from errmsg

    # Compile a list of all URL file paths beneath the respective URL
    # file path provided upon entry; proceed accordingly.
    try:
        suffix = "" if ext is None else ext

        # Anchors without an `href` attribute (e.g., named anchors)
        # yield NoneType and are skipped rather than aborting the
        # entire listing.
        hrefs = (node.get("href") for node in soup.find_all("a"))
        webfiles = [
            href for href in hrefs if href is not None and href.endswith(suffix)
        ]

        if include_dirname:
            dirname = os.path.dirname(url)
            weblist = [os.path.join(dirname, webfile) for webfile in webfiles]
        else:
            weblist = webfiles
    except Exception as errmsg:
        msg = (
            f"Compilation of URL paths beneath URL {url} failed with "
            f"error {errmsg}. Aborting!!!"
        )
        raise URLInterfaceError(msg=msg) from errmsg

    return weblist
# ----
def read_webfile(
    url: str,
    ignore_missing: bool = False,
    split: Union[str, None] = None,
    return_string: bool = False,
) -> Union[List, str, bytes]:
    """
    Description
    -----------

    This function collects the contents of a specified URL path and
    returns the respective contents.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the path to the internet
        (world-wide web; WWW) file to be retrieved.

    Keywords
    --------

    ignore_missing: ``bool``, optional

        A Python boolean valued variable specifying whether to ignore
        URL path requests that raise `urllib.error.HTTPError`; if
        `True` upon entry the returned value (see below) will be an
        empty list.

    split: ``str``, optional

        A Python string specifying the string/characters to be used to
        split the contents of the respective file; if specified, the
        contents are decoded as UTF-8 text prior to splitting.

    return_string: ``bool``, optional

        A Python boolean valued variable specifying whether to return
        the contents of the URL path as a string (decoded as UTF-8);
        if `False` upon entry, and `split` is NoneType, the raw bytes
        read from the URL path are returned.

    Returns
    -------

    contents: ``Union[List, str, bytes]``

        The contents of the specified URL path: a Python list of
        strings if `split` is specified; a Python string if
        `return_string` is `True`; the raw bytes otherwise; an empty
        Python list if the URL path is missing and `ignore_missing` is
        `True` upon entry.

    Raises
    ------

    URLInterfaceError:

        - raised if an exception is encountered while establishing the
          URL path request.

        - raised if the opening the specified URL path fails due to a
          missing endpoint; raised only if ignore_missing is `False`
          upon entry.

        - raised if an exception is encountered while reading or
          parsing (decoding/splitting) the contents of the URL file
          path specified upon entry.

    """

    # Establish the request for the specified URL file path; proceed
    # accordingly.
    try:
        request = urllib.request.Request(url=url)
    except Exception as errmsg:
        msg = (
            f"Retrieving the URL path {url} failed with error {errmsg}. " "Aborting!!!"
        )
        raise URLInterfaceError(msg=msg) from errmsg

    # Open the URL path and read the raw contents. The HTTPError
    # handler is listed first so that a missing endpoint is reported
    # with its own message (chained to the HTTPError itself) instead
    # of being re-wrapped by the generic handler.
    try:
        with urllib.request.urlopen(url=request) as response:
            raw = response.read()
    except urllib.error.HTTPError as url_error:
        if not ignore_missing:
            msg = (
                f"Opening URL path {url} failed with error {url_error}. "
                "Aborting!!!"
            )
            raise URLInterfaceError(msg=msg) from url_error
        msg = (
            f"Opening URL {url} path failed with error {url_error}; "
            "collection of URL path contents will not be "
            "performed."
        )
        logger.warn(msg=msg)
        return []
    except Exception as errmsg:
        msg = (
            f"Reading the contents of URL path {url} failed with error "
            f"{errmsg}. Aborting!!!"
        )
        raise URLInterfaceError(msg=msg) from errmsg

    # Neither a string nor a split list was requested; return the
    # contents in their native (bytes) format.
    if split is None and not return_string:
        return raw

    # Decode the bytes into text; `str(bytes)` would instead produce
    # the repr (e.g., "b'...'" with escaped newlines), which is not
    # the contents of the file.
    try:
        text = raw.decode("utf-8")
        contents = text if split is None else text.split(split)
    except Exception as errmsg:
        msg = (
            f"Reading the contents of URL path {url} failed with error "
            f"{errmsg}. Aborting!!!"
        )
        raise URLInterfaceError(msg=msg) from errmsg

    return contents