Source code for gendis.genetic

# Standard lib
import time

# "Standard" data science libs
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Serialization
import pickle

# Motif extraction
from gendis.mstamp_stomp import mstamp as mstamp_stomp

# Evolutionary algorithms framework
from deap import base, creator, algorithms, tools

# Time series operations
from tslearn.clustering import TimeSeriesKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.barycenters import euclidean_barycenter

# Parallelization
from pathos.multiprocessing import ProcessingPool as Pool

# ML
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import log_loss

# Util functions
import gendis.util as util

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

class GeneticExtractor(BaseEstimator, TransformerMixin):
    """Extract a set of discriminative shapelets with a genetic algorithm.

    Parameters
    ----------
    population_size : int
        The number of individuals in our population. Increasing this
        parameter increases both the runtime per generation and the
        probability of finding a good solution.

    iterations : int
        The maximum number of generations the algorithm may run.

    wait : int
        If no improvement has been found for `wait` iterations, then stop.

    add_noise_prob : float
        The chance that gaussian noise is added to a random shapelet from
        a random individual every generation.

    add_shapelet_prob : float
        The chance that a shapelet is added to a random shapelet set every
        generation.

    remove_shapelet_prob : float
        The chance that a shapelet is removed from a random shapelet set
        every generation.

    crossover_prob : float
        The chance of crossing over two shapelet sets every generation.

    normed : boolean
        Whether to normalize the subseries before calculating distances.

    n_jobs : int
        The number of processes to use.

    verbose : boolean
        Whether to print some statistics in every generation.

    plot : object
        Whether to plot the individuals every generation (if the population
        size is smaller than or equal to 20), or to plot the fittest
        individual.

    Attributes
    ----------
    shapelets : array-like
        The fittest shapelet set after evolution.

    label_mapping : dict
        A dictionary that maps the labels to the range [0, ..., C-1].

    Example
    -------
    An example showing genetic shapelet extraction on a simple dataset:

    >>> from tslearn.generators import random_walk_blobs
    >>> from gendis.genetic import GeneticExtractor
    >>> from sklearn.linear_model import LogisticRegression
    >>> import numpy as np
    >>> np.random.seed(1337)
    >>> X, y = random_walk_blobs(n_ts_per_blob=20, sz=64, noise_level=0.1)
    >>> X = np.reshape(X, (X.shape[0], X.shape[1]))
    >>> extractor = GeneticExtractor(iterations=5, n_jobs=1, population_size=10)
    >>> distances = extractor.fit_transform(X, y)
    >>> lr = LogisticRegression()
    >>> _ = lr.fit(distances, y)
    >>> lr.score(distances, y)
    1.0
    """

    def __init__(self, population_size=50, iterations=25, verbose=False,
                 normed=False, add_noise_prob=0.4, add_shapelet_prob=0.4,
                 wait=10, plot=None, remove_shapelet_prob=0.4,
                 crossover_prob=0.66, n_jobs=4):
        # Hyper-parameters
        self.population_size = population_size
        self.iterations = iterations
        self.verbose = verbose
        self.add_noise_prob = add_noise_prob
        self.add_shapelet_prob = add_shapelet_prob
        self.remove_shapelet_prob = remove_shapelet_prob
        self.crossover_prob = crossover_prob
        self.plot = plot
        self.wait = wait
        self.n_jobs = n_jobs
        self.normed = normed

        # Attributes
        self.label_mapping = {}
        self.shapelets = []
        self._min_length = 0

    def _convert_X(self, X):
        """Convert lists and DataFrames to a numpy array."""
        if isinstance(X, list):
            for i in range(len(X)):
                X[i] = np.array(X[i])
            X = np.array(X)

        if isinstance(X, pd.DataFrame):
            X = X.values

        return X

    def _convert_y(self, y):
        """Map the labels to integers in [0, ..., C-1]."""
        for j, c in enumerate(np.unique(y)):
            self.label_mapping[c] = j

        # Use the pandas map function and convert to numpy
        y = np.reshape(pd.Series(y).map(self.label_mapping).values, (-1, 1))

        return y

    def fit(self, X, y):
        """Extract shapelets from the provided timeseries and labels.

        Parameters
        ----------
        X : array-like, shape = [n_ts, ]
            The training input timeseries. Each timeseries must be an array,
            but the lengths can be variable.

        y : array-like, shape = [n_samples]
            The target values.
""" X = self._convert_X(X) y = self._convert_y(y) self._min_length = min([len(x) for x in X]) if self._min_length <= 4: raise Exception('Time series should be of at least length 4!') # Sci-kit learn check for label vector. check_array(y) # We will try to maximize the negative logloss of LR in CV. # In the case of ties, we pick the one with least number of shapelets weights = (1.0, -1.0) creator.create("FitnessMax", base.Fitness, weights=weights) # Individual are lists (of shapelets (list)) creator.create("Individual", list, fitness=creator.FitnessMax) def random_shapelet(n_shapelets): """Extract a random subseries from the training set""" shaps = [] for _ in range(n_shapelets): rand_row = np.random.randint(X.shape[0]) rand_length = np.random.randint(4, self._min_length) rand_col = np.random.randint(self._min_length - rand_length) shaps.append(X[rand_row][rand_col:rand_col+rand_length]) if n_shapelets > 1: return np.array(shaps) else: return np.array(shaps[0]) def kmeans(n_shapelets, shp_len, n_draw=1000): """Sample subseries from the timeseries and apply K-Means on them""" # Sample `n_draw` subseries of length `shp_len` n_ts, sz = len(X), self._min_length indices_ts = np.random.choice(n_ts, size=n_draw, replace=True) start_idx = np.random.choice(sz - shp_len + 1, size=n_draw, replace=True) end_idx = start_idx + shp_len subseries = np.zeros((n_draw, shp_len)) for i in range(n_draw): subseries[i] = X[indices_ts[i]][start_idx[i]:end_idx[i]] tskm = TimeSeriesKMeans(n_clusters=n_shapelets, metric="euclidean", verbose=False) return tskm.fit(subseries).cluster_centers_ def motif(n_shapelets, n_draw=100): """Extract some motifs from sampled timeseries""" shaps = [] for _ in range(n_shapelets): rand_length = np.random.randint(4, self._min_length) subset_idx = np.random.choice(range(len(X)), size=n_draw, replace=True) ts = [] for idx in subset_idx: ts += list(X[idx].flatten()) matrix_profile, _ = mstamp_stomp(ts, rand_length) motif_idx = matrix_profile[0, :].argsort()[-1] shaps.append(np.array(ts[motif_idx:motif_idx + rand_length])) if n_shapelets > 1: return np.array(shaps) else: return np.array(shaps[0]) def create_individual(n_shapelets=None): """Generate a random shapelet set""" if n_shapelets is None: ub = int(np.sqrt(self._min_length)) + 1 n_shapelets = np.random.randint(2, ub) rand = np.random.random() if n_shapelets > 1: if rand < 1./3.: rand_length = np.random.randint(4, self._min_length) return kmeans(n_shapelets, rand_length) elif 1./3. 
< rand < 2./3.: return motif(n_shapelets) else: return random_shapelet(n_shapelets) else: if rand < 0.5: return motif(n_shapelets) else: return random_shapelet(n_shapelets) def cost(shapelets): """Calculate the fitness of an individual/shapelet set""" start = time.time() D = np.zeros((len(X), len(shapelets))) for k in range(len(X)): ts = X[k] for j in range(len(shapelets)): if self.normed: dist = util.sdist(shapelets[j].flatten(), ts) else: dist = util.sdist_no_norm(shapelets[j].flatten(), ts) D[k, j] = dist lr = LogisticRegression() skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=1337) preds = cross_val_predict(lr, D, y, method='predict_proba', cv=skf) cv_score = -log_loss(y, preds) return (cv_score, sum([len(x) for x in shapelets])) def add_noise(shapelets): """Add random noise to a random shapelet""" rand_shapelet = np.random.randint(len(shapelets)) tools.mutGaussian(shapelets[rand_shapelet], mu=0, sigma=0.1, indpb=0.15) return shapelets, def add_shapelet(shapelets): """Add a shapelet to the individual""" shapelets.append(create_individual(n_shapelets=1)) return shapelets, def remove_shapelet(shapelets): """Remove a random shapelet from the individual""" if len(shapelets) > 1: rand_shapelet = np.random.randint(len(shapelets)) shapelets.pop(rand_shapelet) return shapelets, def merge_crossover(ind1, ind2): """Merge shapelets from one set with shapelets from the other""" # Construct a pairwise similarity matrix using GAK _all = list(ind1) + list(ind2) similarity_matrix = cdist_gak(ind1, ind2, sigma=sigma_gak(_all)) # Iterate over shapelets in `ind1` and merge them with shapelets # from `ind2` for row_idx in range(similarity_matrix.shape[0]): # Remove all elements equal to 1.0 mask = similarity_matrix[row_idx, :] != 1.0 non_equals = similarity_matrix[row_idx, :][mask] if len(non_equals): # Get the timeseries most similar to the one at row_idx max_col_idx = np.argmax(non_equals) ts1 = list(ind1[row_idx]).copy() ts2 = list(ind2[max_col_idx]).copy() # Merge them and remove nans ind1[row_idx] = euclidean_barycenter([ts1, ts2]) ind1[row_idx] = ind1[row_idx][~np.isnan(ind1[row_idx])] # Apply the same for the elements in ind2 for col_idx in range(similarity_matrix.shape[1]): mask = similarity_matrix[:, col_idx] != 1.0 non_equals = similarity_matrix[:, col_idx][mask] if len(non_equals): max_row_idx = np.argmax(non_equals) ts1 = list(ind1[max_row_idx]).copy() ts2 = list(ind2[col_idx]).copy() ind2[col_idx] = euclidean_barycenter([ts1, ts2]) ind2[col_idx] = ind2[col_idx][~np.isnan(ind2[col_idx])] return ind1, ind2 def point_crossover(ind1, ind2): """Apply one- or two-point crossover on the shapelet sets""" if len(ind1) > 1 and len(ind2) > 1: if np.random.random() < 0.5: ind1, ind2 = tools.cxOnePoint(list(ind1), list(ind2)) else: ind1, ind2 = tools.cxTwoPoint(list(ind1), list(ind2)) return ind1, ind2 # Register all operations in the toolbox toolbox = base.Toolbox() if self.n_jobs > 1: pool = Pool(self.n_jobs) toolbox.register("map", pool.map) else: toolbox.register("map", map) # Register all our operations to the DEAP toolbox toolbox.register("merge", merge_crossover) toolbox.register("cx", point_crossover) toolbox.register("mutate", add_noise) toolbox.register("add", add_shapelet) toolbox.register("remove", remove_shapelet) toolbox.register("individual", tools.initIterate, creator.Individual, create_individual) toolbox.register("population", tools.initRepeat, list, toolbox.individual) toolbox.register("evaluate", cost) # Small tournaments to ensure diversity toolbox.register("select", 
        # Set up the statistics: we will measure the mean, std dev and max
        stats = tools.Statistics(key=lambda ind: ind.fitness.values[0])
        stats.register("avg", np.mean)
        stats.register("std", np.std)
        stats.register("max", np.max)

        # Initialize the population and calculate their initial fitness values
        pop = toolbox.population(n=self.population_size)
        fitnesses = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        # Keep track of the best iteration, in order to stop after `wait`
        # generations without improvement
        it, best_it = 1, 1
        best_ind = []
        best_score = float('-inf')

        # Set up a matplotlib figure and set the axes
        height = int(np.ceil(self.population_size / 4))
        if self.plot is not None and self.plot != 'notebook':
            if self.population_size <= 20:
                f, ax = plt.subplots(4, height, sharex=True)
            else:
                plt.figure(figsize=(15, 5))
                plt.xlim([0, len(X[0])])

        # The genetic algorithm starts here
        while it <= self.iterations and it - best_it < self.wait:
            gen_start = time.time()

            # Clone the population into offspring
            offspring = list(map(toolbox.clone, pop))

            # Plot all individuals, or only the fittest one
            if self.plot is not None:
                if self.population_size <= 20:
                    if self.plot == 'notebook':
                        f, ax = plt.subplots(4, height, sharex=True)
                    for ix, ind in enumerate(offspring):
                        ax[ix // height][ix % height].clear()
                        for s in ind:
                            ax[ix // height][ix % height].plot(
                                range(len(s)), s)
                    plt.pause(0.001)
                    if self.plot == 'notebook':
                        plt.show()
                else:
                    plt.clf()
                    if len(best_ind):
                        for shap in best_ind[0]:
                            plt.plot(range(len(shap)), shap)
                    plt.pause(0.001)

            # Iterate over all individuals and apply crossover with a
            # certain probability
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if np.random.random() < self.crossover_prob:
                    toolbox.merge(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values
                if np.random.random() < self.crossover_prob:
                    toolbox.cx(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values

            # Apply mutation to each individual
            for idx, indiv in enumerate(offspring):
                if np.random.random() < self.add_noise_prob:
                    toolbox.mutate(indiv)
                    del indiv.fitness.values
                if np.random.random() < self.add_shapelet_prob:
                    toolbox.add(indiv)
                    del indiv.fitness.values
                if np.random.random() < self.remove_shapelet_prob:
                    toolbox.remove(indiv)
                    del indiv.fitness.values

            # Update the fitness values of the changed individuals
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit

            # Replace the population (keeping the fittest individual as an
            # elite) and update the statistics
            new_pop = toolbox.select(offspring, self.population_size - 1)
            fittest_ind = tools.selBest(pop + offspring, 1)
            pop[:] = new_pop + fittest_ind
            it_stats = stats.compile(pop)

            # Print our statistics
            if self.verbose:
                if it == 1:
                    print('it\t\tavg\t\tstd\t\tmax\t\ttime')
                print('{}\t\t{}\t\t{}\t\t{}\t{}'.format(
                    it,
                    np.around(it_stats['avg'], 4),
                    np.around(it_stats['std'], 3),
                    np.around(it_stats['max'], 6),
                    np.around(time.time() - gen_start, 4),
                ))

            # Have we found a new best score?
            if it_stats['max'] > best_score:
                best_it = it
                best_score = it_stats['max']
                best_ind = tools.selBest(pop + offspring, 1)

            it += 1

        # Keep the flattened shapelets of the fittest individual
        best_shapelets = []
        for shap in best_ind[0]:
            best_shapelets.append(shap.flatten())
        self.shapelets = best_shapelets

    def transform(self, X):
        """After fitting the extractor, we can transform collections of
        timeseries into matrices with the distances to each of the
        shapelets in the evolved shapelet set.
        Parameters
        ----------
        X : array-like, shape = [n_ts, ]
            The training input timeseries. Each timeseries must be an array,
            but the lengths can be variable.

        Returns
        -------
        D : array-like, shape = [n_ts, n_shaps]
            The matrix with distances.
        """
        X = self._convert_X(X)

        # Check whether fit has been called
        check_is_fitted(self, ['shapelets'])

        # Construct the (|X| x |S|) distance matrix
        D = np.zeros((len(X), len(self.shapelets)))
        for smpl_idx, sample in enumerate(X):
            for shap_idx, shapelet in enumerate(self.shapelets):
                if self.normed:
                    dist = util.sdist(shapelet.flatten(), sample)
                else:
                    dist = util.sdist_no_norm(shapelet.flatten(), sample)
                D[smpl_idx, shap_idx] = dist

        return D

    def fit_transform(self, X, y):
        """Combine both the fit and transform method in one.

        Parameters
        ----------
        X : array-like, shape = [n_ts, ]
            The training input timeseries. Each timeseries must be an array,
            but the lengths can be variable.

        y : array-like, shape = [n_samples]
            The target values.

        Returns
        -------
        D : array-like, shape = [n_ts, n_shaps]
            The matrix with distances.
        """
        # First call fit, then transform
        self.fit(X, y)
        D = self.transform(X)
        return D

    def save(self, path):
        """Write away all hyper-parameters and discovered shapelets to disk"""
        with open(path, 'wb') as ofp:
            pickle.dump(self, ofp)

    @staticmethod
    def load(path):
        """Instantiate a saved GeneticExtractor"""
        with open(path, 'rb') as ifp:
            return pickle.load(ifp)
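The snippet below is a minimal usage sketch, not part of the module itself: it exercises the public API above (fit_transform, transform, save and load) on synthetic data from tslearn's random_walk_blobs, as in the class docstring. The train/test split sizes and the file name 'extractor.p' are arbitrary choices for this example.

# Usage sketch (illustrative only, not part of gendis.genetic): extract
# shapelets from synthetic data, build distance matrices for a train and
# a test split, and round-trip the extractor through save/load.
import numpy as np
from tslearn.generators import random_walk_blobs
from sklearn.linear_model import LogisticRegression
from gendis.genetic import GeneticExtractor

np.random.seed(1337)
X, y = random_walk_blobs(n_ts_per_blob=25, sz=64, noise_level=0.1)
X = np.reshape(X, (X.shape[0], X.shape[1]))

# Shuffled train/test split (40 train, 10 test)
idx = np.random.permutation(len(X))
X_train, y_train = X[idx[:40]], y[idx[:40]]
X_test, y_test = X[idx[40:]], y[idx[40:]]

extractor = GeneticExtractor(population_size=10, iterations=5, n_jobs=1)
D_train = extractor.fit_transform(X_train, y_train)  # fit + distance matrix
D_test = extractor.transform(X_test)                 # reuse the shapelets

# The distance matrices can feed any sklearn classifier
lr = LogisticRegression()
lr.fit(D_train, y_train)
print('Test accuracy:', lr.score(D_test, y_test))

# Persist the fitted extractor and restore it later
# ('extractor.p' is an arbitrary path chosen for this example)
extractor.save('extractor.p')
restored = GeneticExtractor.load('extractor.p')
assert np.allclose(restored.transform(X_test), D_test)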