# Source code for ioapps.wget_interface

"""
Module
------

    wget_interface.py

Description
-----------

    This module contains functions to create and collect internet
    (world-wide web; WWW) files by invoking the `wget` application
    executable found in the run-time environment.

Functions
---------

    __check_wget_env__()

        This function checks whether the run-time environment contains
        the `wget` application executable.

    get_webfile(url, path, ignore_missing=False):

        This function collects the specified URL path by invoking the
        `wget` application executable.

    get_weblist(url, path, matchstr=None, remove_webfile=True, ext=None):

        This function collects a list of files beneath the specified
        URL.

Author(s)
---------

    Henry R. Winterbottom; 02 December 2022

History
-------

    2022-12-02: Henry Winterbottom -- Initial implementation.

"""

# ----

# pylint: disable=broad-except
# pylint: disable=consider-using-with
# pylint: disable=too-many-locals
# pylint: disable=unused-argument

# ----

import os
import subprocess
from typing import List

from bs4 import BeautifulSoup
from tools import fileio_interface, system_interface
from utils.exceptions_interface import WgetInterfaceError
from utils.logger_interface import Logger

# ----

# Define all available module properties; only the two download
# functions are exported, the wget environment check stays private.
__all__ = ["get_webfile", "get_weblist"]

# ----

# Module-level logger used by the functions in this module to report
# download progress and file removals.
logger = Logger(caller_name=__name__)

# ----


def __check_wget_env__() -> str:
    """
    Description
    -----------

    This function queries the run-time environment for the `wget`
    application executable and returns its path.

    Returns
    -------

    wget_exec: ``str``

        A Python string specifying the path to the `wget` application
        executable, as reported by
        `system_interface.get_app_path`.

    Raises
    ------

    WgetInterfaceError:

        - raised if `system_interface.get_app_path` returns NoneType
          for the `wget` application.

    """

    # Ask the system interface where the wget application lives and
    # hand the path straight back when one was found.
    app_path = system_interface.get_app_path(app="wget")
    if app_path is not None:
        return app_path

    # No executable could be located; nothing in this module can work
    # without it.
    raise WgetInterfaceError(
        msg=(
            "The wget application executable could not be determined "
            "from the run-time environment. Aborting!!!"
        )
    )


# ----


def get_webfile(url: str, path: str, ignore_missing: bool = False) -> None:
    """
    Description
    -----------

    This function collects the specified URL path by invoking the
    `wget` application executable and writing the result to the
    specified local path.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the path to the internet
        (world-wide web; WWW) file to be retrieved.

    path: ``str``

        A Python string specifying the local path to which to write
        the retrieved file.

    Keywords
    --------

    ignore_missing: ``bool``, optional

        A Python boolean valued variable specifying whether to ignore
        failed retrievals, e.g., missing files (`True`) or raise a
        WgetInterfaceError exception for them (`False`).

    Raises
    ------

    WgetInterfaceError:

        - raised if the `wget` application cannot be launched or
          exits with a non-zero status while `ignore_missing` is
          `False`.

    """

    # Establish the wget application executable path.
    wget_exec = __check_wget_env__()

    # Attempt to collect the specified URL path; proceed accordingly.
    msg = f"Collecting URL path {url}."
    logger.info(msg=msg)
    msg = f"Writing collected URL path {url} to local path {path}."
    logger.info(msg=msg)
    cmd = [f"{wget_exec}", f"{url}", "-O", f"{path}"]

    try:
        # Download the respective URL path; `check=True` turns a
        # non-zero wget exit status into an exception so that a failed
        # retrieval is actually detected rather than silently ignored.
        subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
        )

    except Exception as errmsg:
        # The broad catch is deliberate: every failure is reported to
        # the caller as a WgetInterfaceError unless failures are to be
        # ignored.
        if ignore_missing:
            return

        msg = (
            f"Collecting of internet path {url} failed with error {errmsg}. "
            "Aborting!!!"
        )
        raise WgetInterfaceError(msg=msg) from errmsg
# ----
def get_weblist(
    url: str,
    path: str,
    matchstr: str = None,
    remove_webfile: bool = True,
    ext: str = None,
) -> List:
    """
    Description
    -----------

    This function collects a list of files beneath the specified URL.
    The web page at the URL is downloaded with the `wget` application
    executable to `<path>/<URL basename>.local`, parsed for anchor
    (`<a>`) `href` attributes, and (optionally) removed afterwards.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the path to the internet
        (world-wide web; WWW) directory to be listed.

    path: ``str``

        A Python string specifying the local directory path beneath
        which to write the downloaded web page.

    Keywords
    --------

    matchstr: ``str``, optional

        A Python string specifying a character string for which to
        search while compiling the list of webfiles; if NoneType on
        entry, the entire list of files beneath the specified URL
        will be returned; otherwise, a list of files containing the
        specified string will be returned.

    remove_webfile: ``bool``, optional

        A Python boolean valued variable specifying whether to remove
        the downloaded file containing the contents of the web
        directory defined by the specified URL; this value is set as
        True by default.

    ext: ``str``, optional

        A Python string specifying the web filename extension; if
        NoneType on entry the value defaults to an empty string.

    Returns
    -------

    weblist: ``List``

        A Python list containing the files beneath the specified URL.

    Raises
    ------

    WgetInterfaceError:

        - raised if an Exception is encountered (including a non-zero
          exit status of the `wget` application); the respective
          error message accompanies the message string passed to the
          WgetInterfaceError class.

    """

    # Establish the wget application executable path.
    wget_exec = __check_wget_env__()

    # Define the local file to contain the downloaded web page; the
    # trailing slash is dropped first so that the basename of a
    # directory-style URL is not empty.
    webname = os.path.basename(url.rstrip("/"))
    webpage = os.path.join(path, f"{webname}.local")

    # Attempt to collect the specified list of URL paths; proceed
    # accordingly.
    try:
        # Format the URL path; this is to make sure that the URL
        # string represents a directory tree rather than a file path.
        if not url.endswith("/"):
            url = f"{url}/"
        msg = f"Downloading URL {url} to local path {webpage}."
        logger.info(msg=msg)

        # Attempt to download the URL path; `check=True` raises for a
        # non-zero wget exit status so that a failed download is not
        # mistaken for an empty directory.
        cmd = [f"{wget_exec}", f"{url}", "-O", f"{webpage}"]
        subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
        )

        # Read the contents of the collected URL path.
        with open(webpage, "rb") as file:
            webdata = file.read()

        # Compile a list of all URL paths; anchors without an `href`
        # attribute are skipped and the optional extension and match
        # string filters are applied.
        soup = BeautifulSoup(webdata, "html.parser")
        if ext is None:
            ext = str()
        hrefs = (node.get("href") for node in soup.find_all("a"))
        weblist = [
            href
            for href in hrefs
            if href is not None
            and href.endswith(ext)
            and (matchstr is None or matchstr in href)
        ]

        # Removing the specified files.
        if remove_webfile:
            filelist = [webpage]
            msg = f"The following files will be removed: {filelist}"
            logger.warn(msg=msg)
            fileio_interface.removefiles(filelist)

    except Exception as errmsg:
        msg = (
            f"Collection of files available at internet path {url} failed "
            f"with error {errmsg}. Aborting!!!"
        )
        raise WgetInterfaceError(msg=msg) from errmsg

    return weblist