"""
Path handling for downloads
"""
import os
import shutil

from pathlib import Path

from ..cloud.signature import sha256sum
from construe.exceptions import DatasetsError


# Fixtures is where data being prepared is stored
FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")
MANIFEST = os.path.join(os.path.dirname(__file__), "manifest.json")

# Data dir is the location of downloaded datasets
DATADIR = Path.home() / ".construe" / "data"

# Names of the datasets
DIALECTS = "dialects"
LOWLIGHT = "lowlight"
REDDIT = "reddit"
MOVIES = "movies"
ESSAYS = "essays"
AEGIS = "aegis"
NSFW = "nsfw"

def get_data_home(path=None):
    """
    Return the path of the Construe data directory. This folder is used by
    dataset loaders to avoid downloading data several times.

    By default this folder is ``~/.construe/data``, but it can be set with the
    ``$CONSTRUE_DATA`` environment variable or programmatically by giving a
    folder path. Note that the ``'~'`` symbol is expanded to the user home
    directory, and environment variables are also expanded when resolving the
    path. The directory is created if it does not already exist.
    """
    if path is None:
        path = os.environ.get("CONSTRUE_DATA", DATADIR)

    path = os.path.expanduser(path)
    path = os.path.expandvars(path)

    if not os.path.exists(path):
        os.makedirs(path)

    return path
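
# Illustrative resolution order (a sketch, kept as comments so importing the
# module has no side effects): an explicit argument wins, then $CONSTRUE_DATA,
# then the DATADIR default.
#
#   >>> get_data_home()                    # ~/.construe/data or $CONSTRUE_DATA
#   >>> get_data_home("~/corpora/$USER")   # '~' and env vars are expanded
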
def find_dataset_path(dataset, data_home=None, fname=None, ext=None, raises=True):
    """
    Looks up the path to the specified dataset in the data home directory,
    which is found using the ``get_data_home`` function. By default data home
    is in a config directory in the user's home folder, but it can be modified
    with the ``$CONSTRUE_DATA`` environment variable or by passing in a
    different directory. If the dataset is not found, a ``DatasetsError`` is
    raised by default.
    """
    # Figure out the root directory of the datasets
    data_home = get_data_home(data_home)

    # Figure out the relative path to the dataset
    if fname is None:
        if ext is None:
            path = os.path.join(data_home, dataset)
        else:
            path = os.path.join(data_home, dataset, "{}{}".format(dataset, ext))
    else:
        path = os.path.join(data_home, dataset, fname)

    # Determine if the path exists
    if not os.path.exists(path):
        # Suppress exceptions if required
        if not raises:
            return None

        raise DatasetsError(
            f"could not find dataset at {path} - does it need to be downloaded?"
        )

    return path
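
# How fname and ext shape the returned path (a sketch; AEGIS is one of the
# dataset constants above and "metadata.json" is a hypothetical file name):
#
#   >>> find_dataset_path(AEGIS)                         # <data_home>/aegis
#   >>> find_dataset_path(AEGIS, ext=".csv")             # <data_home>/aegis/aegis.csv
#   >>> find_dataset_path(AEGIS, fname="metadata.json")  # <data_home>/aegis/metadata.json
#
# Pass raises=False to get None for a missing dataset instead of DatasetsError.
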
def dataset_exists(dataset, data_home=None):
    """
    Checks to see if a directory with the name of the specified dataset exists
    in the data home directory, found with ``get_data_home``.
    """
    data_home = get_data_home(data_home)
    path = os.path.join(data_home, dataset)

    return os.path.exists(path) and os.path.isdir(path)
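
# For example (a sketch):
#
#   >>> dataset_exists(REDDIT)   # False until the archive has been downloaded
#   ...                          # and extracted into <data_home>/reddit
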
def dataset_archive(dataset, signature, data_home=None, ext=".zip"):
    """
    Checks to see if the dataset archive file exists in the data home directory,
    found with ``get_data_home``. By specifying the signature, this function
    also checks to see if the archive is the latest version by comparing the
    sha256sum of the local archive with the specified signature.
    """
    data_home = get_data_home(data_home)
    path = os.path.join(data_home, dataset + ext)

    if os.path.exists(path) and os.path.isfile(path):
        return sha256sum(path) == signature

    return False
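
# Typical check before downloading (a sketch; the signature value is
# hypothetical and would normally come from the manifest):
#
#   >>> dataset_archive(LOWLIGHT, "9f2c...aa51")   # True only if lowlight.zip
#   ...                                            # exists and its sha256 matches
#
# A False return means the archive is missing or stale and should be fetched.
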
def cleanup_dataset(dataset, data_home=None, ext=".zip"):
    """
    Removes the dataset directory and archive file from the data home directory.
    Returns the number of items (directory and/or archive) that were removed.
    """
    removed = 0
    data_home = get_data_home(data_home)

    # Paths to remove
    datadir = os.path.join(data_home, dataset)
    archive = os.path.join(data_home, dataset + ext)

    # Remove directory and contents
    if os.path.exists(datadir):
        shutil.rmtree(datadir)
        removed += 1

    # Remove the archive file
    if os.path.exists(archive):
        os.remove(archive)
        removed += 1

    return removed
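

# A minimal, optional smoke test (a sketch, not part of the library API): it
# resolves the data home and reports which of the known datasets are present.
if __name__ == "__main__":
    home = get_data_home()
    print(f"data home: {home}")
    for name in (DIALECTS, LOWLIGHT, REDDIT, MOVIES, ESSAYS, AEGIS, NSFW):
        status = "present" if dataset_exists(name) else "not downloaded"
        print(f"  {name}: {status}")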