Source code for qiskit.aqua.utils.dataset_helper

# -*- coding: utf-8 -*-

# This code is part of Qiskit.
#
# (C) Copyright IBM 2018, 2020.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

""" Data set helper """

import operator
from copy import deepcopy
import numpy as np
from sklearn.decomposition import PCA


def get_num_classes(dataset):
    """ Check number of classes in a given dataset

    Args:
        dataset (dict): key is the class name and value is the data.

    Returns:
        int: number of classes
    """
    return len(list(dataset.keys()))
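# A minimal usage sketch (the two-class toy dataset below is invented for
# illustration and is not part of this module):
#
#     dataset = {'A': np.ones((5, 2)), 'B': np.zeros((3, 2))}
#     get_num_classes(dataset)  # -> 2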
def get_feature_dimension(dataset):
    """ Check feature dimension of a given dataset

    Args:
        dataset (dict): key is the class name and value is the data.

    Returns:
        int: feature dimension, -1 denotes no data in the dataset.

    Raises:
        TypeError: invalid data set
    """
    if not isinstance(dataset, dict):
        raise TypeError("Dataset is not formatted as a dict. Please check it.")

    feature_dim = -1
    for v in dataset.values():
        if not isinstance(v, np.ndarray):
            v = np.asarray(v)
        # the dimension is read from the first class's data; -1 is only
        # returned when the dataset contains no classes at all
        return v.shape[1]

    return feature_dim
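# Continuing the toy dataset from the sketch above, the feature dimension is
# read from the first class's array:
#
#     get_feature_dimension(dataset)  # -> 2, i.e. dataset['A'].shape[1]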
# pylint: disable=invalid-name
def split_dataset_to_data_and_labels(dataset, class_names=None):
    """
    Split a dataset into data and labels numpy arrays

    If `class_names` is given, use the given label to class name mapping;
    otherwise, create the mapping based on the keys in the dataset.

    Args:
        dataset (dict): {'A': numpy.ndarray, 'B': numpy.ndarray, ...}
        class_names (dict): class name of dataset, {class_name: label}

    Returns:
        Union(tuple(list, dict), list):
            The list contains two arrays of numpy.ndarray type: the array at
            index 0 is the data, an NxD array, and the array at index 1 is the
            labels, an Nx1 array containing values in the range 0 to K-1,
            where K is the number of classes. The dict is a map {str: int},
            mapping class name to label. The tuple of list and dict is
            returned when `class_names` is None, otherwise just the list
            is returned.

    Raises:
        KeyError: data set invalid
    """
    data = []
    labels = []
    if class_names is None:
        sorted_classes_name = sorted(list(dataset.keys()))
        class_to_label = {k: idx for idx, k in enumerate(sorted_classes_name)}
    else:
        class_to_label = class_names
    sorted_label = sorted(class_to_label.items(), key=operator.itemgetter(1))
    for class_name, _ in sorted_label:
        values = dataset[class_name]
        for value in values:
            data.append(value)
            try:
                labels.append(class_to_label[class_name])
            except Exception as ex:  # pylint: disable=broad-except
                raise KeyError('The dataset has different class names from '
                               'the training data. Error message: {}'.format(ex)) from ex
    data = np.asarray(data)
    labels = np.asarray(labels)
    if class_names is None:
        return [data, labels], class_to_label
    return [data, labels]
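# A hedged sketch continuing the toy dataset from above; class names are
# sorted, so 'A' maps to label 0 and 'B' to label 1:
#
#     (data, labels), class_to_label = split_dataset_to_data_and_labels(dataset)
#     data.shape      # -> (8, 2)
#     labels          # -> array([0, 0, 0, 0, 0, 1, 1, 1])
#     class_to_label  # -> {'A': 0, 'B': 1}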
def map_label_to_class_name(predicted_labels, label_to_class):
    """ Helper converts labels (numeric) to class names (string)

    Args:
        predicted_labels (numpy.ndarray): Nx1 array
        label_to_class (dict or list): a mapping from label (numeric)
                                       to class name (str)

    Returns:
        list: predicted class names of each datum
    """
    if not isinstance(predicted_labels, np.ndarray):
        predicted_labels = np.asarray([predicted_labels])

    predicted_class_names = []
    for predicted_label in predicted_labels:
        predicted_class_names.append(label_to_class[predicted_label])
    return predicted_class_names
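# Mapping predicted numeric labels back to class names (the mapping below is
# illustrative):
#
#     label_to_class = {0: 'A', 1: 'B'}
#     map_label_to_class_name(np.array([1, 0, 1]), label_to_class)
#     # -> ['B', 'A', 'B']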
def reduce_dim_to_via_pca(x, dim):
    """ Reduce the data dimension via PCA

    Args:
        x (numpy.ndarray): NxD array
        dim (int): the targeted dimension D'

    Returns:
        numpy.ndarray: NxD' array
    """
    x_reduced = PCA(n_components=dim).fit_transform(x)
    return x_reduced
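# A hedged PCA sketch: project invented 3-feature samples down to two
# dimensions; the output values depend on the data, so only the shape is
# shown:
#
#     x = np.random.rand(8, 3)
#     x_reduced = reduce_dim_to_via_pca(x, dim=2)
#     x_reduced.shape  # -> (8, 2)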
def discretize_and_truncate(data, bounds, num_qubits, return_data_grid_elements=False,
                            return_prob=False, prob_non_zero=True):
    """ Discretize & truncate classical data to enable digital encoding in
    qubit registers, whereby the data grid is
    [[grid elements dim 0], ..., [grid elements dim k]]

    Args:
        data (list or numpy.ndarray): training data (int or float) of dimension k
        bounds (list or numpy.ndarray): k min/max data values
            [[min_0, max_0], ..., [min_k-1, max_k-1]];
            if univariate data: [min_0, max_0]
        num_qubits (list or numpy.ndarray): k numbers of qubits to determine
            representation resolution, i.e. n qubits enable the representation
            of 2**n values [num_qubits_0, ..., num_qubits_k-1]
        return_data_grid_elements (bool): if True - return an array with the
            data grid elements
        return_prob (bool): if True - return a normalized frequency count of
            the discretized and truncated data samples
        prob_non_zero (bool): if True - set 0 values in prob_data to 10^-10 to
            avoid potential problems when using the probabilities in loss
            functions - division by 0

    Returns:
        array: discretized and truncated data
        array: data grid [[grid elements dim 0], ..., [grid elements dim k]]
        array: grid elements, Product_j=0^k-1 2**num_qubits_j element vectors
        array: data probability, normalized frequency count sorted from
            smallest to biggest element
    """
    # Truncate the data
    if np.ndim(bounds) == 1:
        bounds = np.reshape(bounds, (1, len(bounds)))

    data = np.asarray(data)  # accept plain lists, as documented above
    data = data.reshape((len(data), len(num_qubits)))
    temp = []
    for data_sample in data:
        append = True
        for j, entry in enumerate(data_sample):
            if entry < bounds[j, 0]:
                append = False
            if entry > bounds[j, 1]:
                append = False
        if append:
            temp.append(list(data_sample))
    data = np.array(temp)

    # Fit the data to the data element grid
    for j, prec in enumerate(num_qubits):
        data_row = data[:, j]  # dim j of all data samples
        # prepare element grid for dim j
        elements_current_dim = np.linspace(bounds[j, 0], bounds[j, 1], (2 ** prec))
        # find index for data sample in grid
        index_grid = np.searchsorted(
            elements_current_dim,
            data_row - (elements_current_dim[1] - elements_current_dim[0]) * 0.5)
        for k, index in enumerate(index_grid):
            data[k, j] = elements_current_dim[index]
        if j == 0:
            if len(num_qubits) > 1:
                data_grid = [elements_current_dim]
            else:
                data_grid = elements_current_dim
            grid_elements = elements_current_dim
        elif j == 1:
            temp = []
            for grid_element in grid_elements:
                for element_current in elements_current_dim:
                    temp.append([grid_element, element_current])
            grid_elements = temp
            data_grid.append(elements_current_dim)
        else:
            temp = []
            for grid_element in grid_elements:
                for element_current in elements_current_dim:
                    # list.append returns None, so copy the grid element first
                    # and extend the copy explicitly
                    new_element = deepcopy(grid_element)
                    new_element.append(element_current)
                    temp.append(new_element)
            grid_elements = deepcopy(temp)
            data_grid.append(elements_current_dim)
    data_grid = np.array(data_grid)
    data = np.reshape(data, (len(data), len(data[0])))

    if return_prob:
        if np.ndim(data) > 1:
            prob_data = np.zeros(int(np.prod(np.power(np.ones(len(data[0])) * 2, num_qubits))))
        else:
            prob_data = np.zeros(int(np.prod(np.power(np.array([2]), num_qubits))))
        for data_element in data:
            for i, element in enumerate(grid_elements):
                if all(data_element == element):
                    prob_data[i] += 1 / len(data)
        if prob_non_zero:
            # add epsilon to avoid 0 entries which can be problematic
            # in loss functions (division)
            prob_data = [1e-10 if x == 0 else x for x in prob_data]

        if return_data_grid_elements:
            return data, data_grid, grid_elements, prob_data
        return data, data_grid, prob_data

    if return_data_grid_elements:
        return data, data_grid, grid_elements
    return data, data_grid
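# A hedged end-to-end sketch for univariate data on a 2-qubit grid; the sample
# values are invented for illustration. The sample 3.7 lies outside the bounds
# [0, 3] and is dropped; the rest are snapped to the nearest of the
# 2**2 = 4 grid points:
#
#     samples = np.array([0.2, 1.4, 2.9, 3.7])
#     disc, grid, prob = discretize_and_truncate(
#         samples, bounds=[0., 3.], num_qubits=[2], return_prob=True)
#     grid  # -> array([0., 1., 2., 3.])
#     disc  # -> array([[0.], [1.], [3.]])
#     prob  # -> [1/3, 1/3, 1e-10, 1/3] (zero count replaced by epsilon)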