Source code for ioapps.curl_interface

"""
Module
------

    curl_interface.py

Description
-----------

    This module contains functions to create and collect internet
    (world-wide web; WWW) files using the respective platform curl
    application executable.

Functions
---------

    __check_curl_env__()

        This function checks whether the run-time environment contains
        the curl application executable; if not, a CurlInterfaceError
        exception will be raised; if so, the path to the curl
        executable will be defined and returned.

    get_webfile(url, path, local_filename=None, ignore_missing=False)

        This function collects the specified URL path using the
        respective platform curl application executable.

    get_weblist(url, ext=None)

        This function collects a list of files beneath the specified
        URL.

Requirements
------------

- bs4; https://www.crummy.com/software/BeautifulSoup/

Author(s)
---------

    Henry R. Winterbottom; 29 November 2022

History
-------

    2022-11-29: Henry Winterbottom -- Initial implementation.

"""

# ----

# pylint: disable=broad-except
# pylint: disable=consider-using-with

# ----

import os
import subprocess
from typing import List

import requests
from bs4 import BeautifulSoup
from tools import system_interface
from utils.exceptions_interface import CurlInterfaceError
from utils.logger_interface import Logger

# ----

# Define the public API of this module; only the two collection
# functions are exported, the curl environment check helper is not.
__all__ = ["get_webfile", "get_weblist"]

# ----

# Module-level logger shared by the functions below; it is named after
# this module via `__name__`.
logger = Logger(caller_name=__name__)

# ----


def __check_curl_env__() -> str:
    """
    Description
    -----------

    This function queries the run-time environment for the curl
    application executable and returns its path; when no path can be
    determined a CurlInterfaceError exception is raised.

    Returns
    -------

    curl_exec: ``str``

        A Python string specifying the path to the curl application
        executable, as returned by `system_interface.get_app_path`.

    Raises
    ------

    CurlInterfaceError:

        - raised if the curl application executable path cannot be
          determined (i.e., the path lookup returns NoneType).

    """

    # Look up the curl application executable; return immediately if
    # the lookup succeeded.
    curl_path = system_interface.get_app_path(app="curl")
    if curl_path is not None:
        return curl_path

    # The lookup failed; report the problem to the caller.
    msg = (
        "The curl application executable could not be determined "
        "from the run-time environment. Aborting!!!"
    )
    raise CurlInterfaceError(msg=msg)


# ----


def get_webfile(
    url: str, path: str, local_filename: str = None, ignore_missing: bool = False
) -> None:
    """
    Description
    -----------

    This function collects the specified URL path using the respective
    platform curl application executable.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the path to the internet
        (world-wide web; WWW) file to be retrieved.

    path: ``str``

        A Python string specifying the local path for the retrieved
        file. NOTE: this value is currently used only in the log
        message; when `local_filename` is NoneType the curl `-O`
        option writes the file, under its remote name, into the
        current working directory.

    Keywords
    --------

    local_filename: ``str``, optional

        A Python string specifying the basename path to which the
        collected URL path is to be written; the respective (renamed)
        file is written beneath the current working directory. If
        NoneType on entry, the remote filename is used and an
        interrupted download is resumed (curl `-C -`).

    ignore_missing: ``bool``, optional

        A Python boolean valued variable specifying whether to ignore
        failed collections (`True`; the failure is only logged) or to
        raise a CurlInterfaceError exception (`False`).

    Raises
    ------

    CurlInterfaceError:

        - raised if the curl application executable cannot be
          launched or exits with a non-zero status and
          `ignore_missing` is `False`.

    """

    # Establish the curl application executable path.
    curl_exec = __check_curl_env__()

    # Collect the internet-based file and proceed accordingly.
    msg = f"Collecting URL path {url}."
    logger.info(msg=msg)
    try:
        # Check whether to rename the download file; build the curl
        # application executable command line arguments accordingly.
        if local_filename is None:
            msg = f"Writing collected URL path {url} to local path {path}."
            cmd = [f"{curl_exec}", "-C", "-", "-O", url]
        else:
            # curl writes the output file itself via `-o`; no Python
            # file handle is required (and none can be leaked).
            local_filepath = os.path.join(os.getcwd(), local_filename)
            msg = f"Writing collected URL path {url} and writing to path {local_filepath}."
            cmd = [f"{curl_exec}", "-o", local_filepath, url]

        # Collect the URL path; both output streams are captured so
        # that the curl diagnostics can be reported upon failure.
        logger.info(msg=msg)
        result = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False
        )

        # A non-zero exit status means that the transfer failed.
        # NOTE(review): HTTP error statuses (e.g., 404) are presumably
        # not reported through the exit status unless curl is run with
        # `--fail`; confirm whether that option should be added.
        if result.returncode != 0:
            # curl writes its progress meter to the standard error
            # stream as well; the final non-empty line holds the
            # actual error message.
            stderr_lines = [
                line.strip()
                for line in result.stderr.decode(errors="replace").splitlines()
                if line.strip()
            ]
            reason = stderr_lines[-1] if stderr_lines else "no diagnostic output"
            raise RuntimeError(
                f"curl exited with status {result.returncode} ({reason})"
            )

    except Exception as errmsg:
        # Proceed accordingly for internet-based file paths that could
        # not be collected.
        if ignore_missing:
            msg = (
                f"Collecting of internet path {url} failed with error {errmsg}; "
                "ignoring."
            )
            logger.info(msg=msg)
            return

        msg = (
            f"Collecting of internet path {url} failed with error {errmsg}. "
            "Aborting!!!"
        )
        raise CurlInterfaceError(msg=msg) from errmsg


# ----


def get_weblist(url: str, ext: str = None) -> List:
    """
    Description
    -----------

    This function collects a list of files beneath the specified URL
    by retrieving the page at the URL and gathering the `href` values
    of its anchor (`<a>`) elements.

    Parameters
    ----------

    url: ``str``

        A Python string specifying the path to the internet
        (world-wide web; WWW) page whose links are to be listed.

    Keywords
    --------

    ext: ``str``, optional

        A Python string specifying the web filename extension; only
        links ending with this string are returned; if NoneType on
        entry the value defaults to an empty string (i.e., all links
        are returned).

    Returns
    -------

    weblist: ``List``

        A Python list containing the files beneath the specified URL;
        each entry is the string `url`, a `/` separator, and the link
        `href` value concatenated in that order.

    Raises
    ------

    CurlInterfaceError:

        - raised if an Exception is encountered while attempting to
          compile a list of files beneath the specified URL.

    """

    # Collect a list of files beneath the specified URL path; proceed
    # accordingly.
    try:
        # Retrieve the page at the URL path and parse it; note that no
        # request timeout is applied.
        webpage = requests.get(url=url, timeout=None).text
        soup = BeautifulSoup(webpage, "html.parser")

        # Compile a list of all URL paths; anchors without an `href`
        # attribute (e.g., named anchors) are skipped rather than
        # causing the entire listing to fail.
        if ext is None:
            ext = ""
        hrefs = (node.get("href") for node in soup.find_all("a"))
        weblist = [
            url + "/" + href
            for href in hrefs
            if href is not None and href.endswith(ext)
        ]
    except Exception as errmsg:
        msg = (
            f"Collection of files available at internet path {url} failed "
            f"with error {errmsg}. Aborting!!!"
        )
        raise CurlInterfaceError(msg=msg) from errmsg

    return weblist