try:
import requests
except:
print("[WARNING] Could not import requests in clustpy.data._utils. Please install requests by 'pip install requests' if necessary")
import numpy as np
import urllib.request
import os
from pathlib import Path
import ssl
from PIL import Image
# Fallback directory for downloaded data sets ('<home>/Downloads/clustpy_datafiles').
# _get_download_dir uses it when no path is passed and the CLUSTPY_DATA environment variable is not set.
DEFAULT_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_datafiles")
def _get_download_dir(downloads_path: str) -> str:
    """
    Helper function to define the path where the data files should be stored.
    If downloads_path is None, the environment variable 'CLUSTPY_DATA' is used if it is set, otherwise the default
    path '[USER]/Downloads/clustpy_datafiles'.
    If the directory does not exist it will be created together with a short 'info.txt' that explains its purpose.

    Parameters
    ----------
    downloads_path : str
        path to the directory where the data will be stored. Can be None

    Returns
    -------
    downloads_path : str
        path to the directory where the data will be stored. If input was None this will be equal to the value of
        the environment variable 'CLUSTPY_DATA' or, if that is not set, to '[USER]/Downloads/clustpy_datafiles'
    """
    if downloads_path is None:
        # The environment variable takes precedence over the built-in default
        downloads_path = os.environ.get("CLUSTPY_DATA", None)
        if downloads_path is None:
            downloads_path = DEFAULT_DOWNLOAD_PATH
    if not os.path.isdir(downloads_path):
        # exist_ok prevents a crash if another process creates the directory between the check and this call
        os.makedirs(downloads_path, exist_ok=True)
        # The info file is only written when the directory is newly created
        with open(os.path.join(downloads_path, "info.txt"), "w", encoding="utf-8") as f:
            f.write("This directory was created by the ClustPy python package to store real world data sets.\n"
                    "The default directory is '[USER]/Downloads/clustpy_datafiles' and can be changed with the "
                    "'downloads_path' parameter when loading a data set.\n"
                    "Alternatively, a global python environment variable for the path can be defined with os.environ['CLUSTPY_DATA'] = 'PATH'.")
    return downloads_path
def _download_file(file_url: str, filename_local: str) -> None:
    """
    Helper function to download a file into a specified location.
    While the download is running, the default HTTPS context of the ssl module is replaced by an unverified one.
    The previous default is restored afterwards, also if the download fails.

    Parameters
    ----------
    file_url : str
        URL of the file
    filename_local : str
        local name of the file after it has been downloaded
    """
    print("Downloading data set from {0} to {1}".format(file_url, filename_local))
    # NOTE(security): certificate verification is disabled for this download, so the server identity is not checked.
    # The patch is process-wide while it is active; this is kept as in the original implementation.
    default_ssl = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        urllib.request.urlretrieve(file_url, filename_local)
    finally:
        # Always restore the global default so that a failed download does not leave verification switched off
        ssl._create_default_https_context = default_ssl
def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_size: int = 32768) -> None:
    """
    Download a file from google drive.
    If the first response is an HTML page, a download URL and a UUID are extracted from it and a second request
    is sent to obtain the actual file.
    Code taken from:
    https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url

    Parameters
    ----------
    file_id : str
        ID of the file on google drive
    filename_local : str
        local name of the file after it has been downloaded
    chunk_size : int
        chunk size when downloading the file (default: 32768)

    Raises
    ------
    Exception
        If the first response is an HTML page from which the download URL or the UUID can not be extracted
    """
    print("Downloading data set {0} from Google Drive to {1}".format(file_id, filename_local))
    URL = "https://drive.google.com/uc"
    # The context manager closes the session even if a request or writing the file fails
    with requests.Session() as session:
        response = session.get(URL, params={"id": file_id, "confirm": "t"}, stream=True)
        # NOTE(review): accessing response.text presumably reads the complete body into memory although
        # stream=True is set - confirm that this is acceptable for large files
        if response.text.startswith("<!DOCTYPE"):
            # Large files can not be obtained automatically but need a second request
            try:
                URL_extracted = response.text.split("download-form\" action=\"")[1].split("\" method=\"get\"")[0]
                uuid = response.text.split("name=\"uuid\" value=\"")[1].split("\">")[0]
            except IndexError as err:
                # split(...)[1] fails with an IndexError if the expected marker is not part of the page
                raise Exception(
                    "[ERROR] New URL and UUID could not be extracted from first request in _download_file_from_google_drive") from err
            response = session.get(URL_extracted, params={"id": file_id, "confirm": "t", "uuid": uuid}, stream=True)
        with open(filename_local, "wb") as f:
            for chunk in response.iter_content(chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> (
        np.ndarray, np.ndarray):
    """
    Helper function to load a data file. Either the first or last column, depending on last_column_are_labels, of the
    data file is used as the label column.
    If file does not exist on the local machine it will be downloaded.

    Parameters
    ----------
    filename_local : str
        local name of the file after it has been downloaded
    file_url : str
        URL of the file
    delimiter : str
        delimiter in the data file (default: ",")
    last_column_are_labels : bool
        specifies if the last column contains the labels. If false labels should be contained in the first column (default: True)

    Returns
    -------
    data, labels : (np.ndarray, np.ndarray)
        the data numpy array, the labels numpy array (converted to int32)
    """
    if not os.path.isfile(filename_local):
        _download_file(file_url, filename_local)
    datafile = np.genfromtxt(filename_local, delimiter=delimiter)
    if datafile.ndim < 2:
        # genfromtxt squeezes a file with a single line to a 1d array; it is interpreted as one sample here
        # so that the column slicing below works
        datafile = datafile.reshape(1, -1)
    if last_column_are_labels:
        data = datafile[:, :-1]
        labels = datafile[:, -1]
    else:
        data = datafile[:, 1:]
        labels = datafile[:, 0]
    # Convert labels to int32 format
    labels = labels.astype(np.int32)
    return data, labels
def _decompress_z_file(filename: str, directory: str) -> bool:
    """
    Helper function to decompress a *.Z file. The function uses an installed version of 7zip to decompress the file.
    If the decompressed file (the file name without its last two characters, i.e. without '.Z') does not exist
    afterwards, e.g. because 7zip is not installed on this machine, the function will return False and a warning
    is printed.

    Parameters
    ----------
    filename : str
        name of the file that should be decompressed
    directory : str
        directory of the file that should be decompressed

    Returns
    -------
    successful : bool
        True if decompression was successful, else False
    """
    # Local import so that this function does not depend on the import section of the module
    import subprocess
    # Passing the arguments as a list (no shell) keeps paths with spaces intact and prevents that special
    # characters in the paths are interpreted as shell commands
    command = ["7z", "x", filename.replace("\\", "/"), "-o" + directory.replace("\\", "/")]
    try:
        subprocess.run(command, check=False)
    except OSError:
        # The 7z executable could not be started. This is reported through the file check below
        pass
    # If no file without .z exists, decompression was not successful
    successful = os.path.isfile(filename[:-2])
    if not successful:
        print("[WARNING] 7Zip is needed to uncompress *.Z files!")
    return successful
def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.ndarray:
    """
    Load image and convert it into a coherent size. Returns a numpy array containing the image data.

    Parameters
    ----------
    image : str
        Path to the image. Can also be a numpy array containing the specific pixels
    image_size : tuple
        images of various sizes can be converted into a coherent size.
        The tuple equals (width, height) of the images.
        Can also be None if the image size should not be changed
    color_image : bool
        Specifies if the loaded image is a color image. If True, the image is converted to RGB

    Returns
    -------
    image_data : np.ndarray
        The numpy array containing the image data
    """
    if isinstance(image, (str, os.PathLike)):
        pil_image = Image.open(image)
    else:
        # Pixel values are cast to uint8 before the PIL image is created
        pil_image = Image.fromarray(np.uint8(image))
    if color_image:
        pil_image = pil_image.convert("RGB")
    # Convert to coherent size
    if image_size is not None:
        pil_image = pil_image.resize(image_size)
    image_data = np.asarray(pil_image)
    if image_size is not None:
        # image_size is (width, height) while the array is (height, width[, channels]), so the order is swapped.
        # The channel axis is only checked for color images because other images need not have one
        size_is_correct = image_data.shape[:2] == (image_size[1], image_size[0])
        channels_are_correct = not color_image or image_data.shape[2:] == (3,)
        assert size_is_correct and channels_are_correct, "Size of image is not correct. Should be {0} but is {1}".format(
            image_size, image_data.shape)
    return image_data
def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
    """
    Convert data array from image to numerical vector.
    Before flattening, color images will be converted to the HWC/HWDC (height, width, color channels) format.
    The first axis of the data array is kept, all remaining axes are merged into a single one.

    Parameters
    ----------
    data : np.ndarray
        The given data set
    format : str
        Format of the images with the data array. Can be: "HW", "HWD", "CHW", "CHWD", "HWC", "HWDC".
        Abbreviations stand for: H: Height, W: Width, D: Depth, C: Color-channels

    Returns
    -------
    data : np.ndarray
        The flatten data array
    """
    format_possibilities = ["HW", "HWD", "CHW", "CHWD", "HWC", "HWDC"]
    assert format in format_possibilities, "Format must be within {0}".format(format_possibilities)
    # Number of array dimensions per format (image axes plus the leading axis)
    expected_ndim = {"HW": 3, "HWD": 4, "CHW": 4, "HWC": 4, "CHWD": 5, "HWDC": 5}
    assert data.ndim == expected_ndim[format], "Data in format {0} must have {1} dimensions not {2}".format(
        format, expected_ndim[format], data.ndim)
    if format == "CHW":
        # Change representation to HWC
        data = np.transpose(data, [0, 2, 3, 1])
    elif format == "CHWD":
        # Change representation to HWDC
        data = np.transpose(data, [0, 2, 3, 4, 1])
    if format not in ("HW", "HWD"):
        # All remaining formats are color images
        assert data.shape[
                   -1] == 3, "Color-channels must be in the last position and contain three channels not {0} ({1})".format(
            data.shape[-1], data.shape)
    # Flatten shape
    data = data.reshape(data.shape[0], -1)
    return data
def unflatten_images(data_flatten: np.ndarray, image_size: tuple) -> np.ndarray:
    """
    Convert data array from numerical vector to image.
    After unflattening, color images will be converted to the CHW/CHWD (color channels, height, width) format.
    An image is treated as a color image if the last entry of image_size is 3 and the unflatten array has 4 or 5
    dimensions.

    Parameters
    ----------
    data_flatten : np.ndarray
        The given flatten data set
    image_size : tuple
        The size of a single image, e.g., (28,28,3) for a colored image of size 28 x 28

    Returns
    -------
    data_image : np.ndarray
        The unflatten data array corresponding to an image
    """
    # The leading -1 lets numpy infer the size of the first axis
    data_image = data_flatten.reshape((-1, *image_size))
    # Change image from HWC/HWDC to CHW/CHWD
    # NOTE(review): a non-color image of size (H, W, 3) can not be distinguished from a color image here and will
    # be transposed as well - confirm that callers never pass such a size
    if image_size[-1] == 3:
        if data_image.ndim == 4:
            data_image = np.transpose(data_image, (0, 3, 1, 2))
        elif data_image.ndim == 5:
            data_image = np.transpose(data_image, (0, 4, 1, 2, 3))
    return data_image