[1]:

%load_ext autoreload
%autoreload 2

[26]:

import warnings
# Blanket filter: with no category or module given, this hides every warning
# raised afterwards, not only the gensim one it was added for.
warnings.filterwarnings('ignore') # to remove gensim warning

Auto clustering¶

This notebook will explain the auto-clustering capabilities of aikit.

It walks through the individual components involved. If you just want to run an auto-clustering, you should use the automl launcher instead.

Custom random search

[3]:

import pandas as pd
from sklearn.datasets import load_iris

[4]:

# Load scikit-learn's iris dataset; its `.data`, `.target` and `.feature_names`
# attributes are used in the following cells.
iris = load_iris()

[5]:

# Raw feature array and target array, unpacked from the loaded dataset.
X, y = iris.data, iris.target

[6]:

# Wrap the feature array in a DataFrame with named columns.
# Built from `iris.data` rather than from the previous value of `X`, so the
# cell has an explicit lineage and gives the same result however often it is re-run.
X = pd.DataFrame(iris.data, columns=iris.feature_names)
X.head()

[6]:

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

[7]:

# Targets as a one-column DataFrame, built from `iris.target` rather than from
# the previous value of `y` so the cell does not depend on earlier rebinding.
# NOTE: the clustering run in this notebook passes y=None, so `y` is not used by it.
y = pd.DataFrame(iris.target, columns=['label'])

[9]:

from aikit.ml_machine import AutoMlConfig, JobConfig,  MlJobManager, MlJobRunner, AutoMlResultReader
from aikit.ml_machine import FolderDataPersister, SavingType, AutoMlModelGuider

AutoML configuration object¶

This object will contain all the relevant information about the problem at hand:

* its type: REGRESSION, CLASSIFICATION or CLUSTERING
* the information about the columns in the data
* the steps that are needed in the processing pipeline (see explanation below)
* the models that are to be tested
* …

By default the configuration object will guess everything, but any of it can be changed if needed.

If y is set to None, it will guess that it is a clustering problem.

[10]:

from aikit.ml_machine.ml_machine_registration import MODEL_REGISTER

[11]:

# Passing y=None is what makes the configuration treat this as a clustering problem.
auto_ml_config = AutoMlConfig(
    dfX=X,
    y=None,
    name="iris",
)
auto_ml_config.guess_everything()

[11]:

<aikit.ml_machine.ml_machine.AutoMlConfig object at 0x12eb44630>
type of problem : CLUSTERING

[12]:

# Problem type held by the configuration after guess_everything() was called.
auto_ml_config.type_of_problem

[12]:

'CLUSTERING'

[13]:

# Per-column information as a table; transposed so each input column is one row.
pd.DataFrame(auto_ml_config.columns_informations).T

[13]:

	HasMissing	ToKeep	TypeOfVariable
sepal length (cm)	False	True	NUM
sepal width (cm)	False	True	NUM
petal length (cm)	False	True	NUM
petal width (cm)	False	True	NUM

[14]:

# List of pipeline steps, each carrying an 'optional' flag.
auto_ml_config.needed_steps

[14]:

[{'step': 'Scaling', 'optional': True},
 {'step': 'DimensionReduction', 'optional': True},
 {'step': 'FeatureExtraction', 'optional': True},
 {'step': 'FeatureSelection', 'optional': True},
 {'step': 'Model', 'optional': False}]

[15]:

# (step, model name) pairs currently kept by the configuration.
auto_ml_config.models_to_keep

[15]:

[('TextEncoder', 'CountVectorizerWrapper'),
 ('TextEncoder', 'Word2VecVectorizer'),
 ('TextEncoder', 'Char2VecVectorizer'),
 ('TextPreprocessing', 'TextNltkProcessing'),
 ('TextPreprocessing', 'TextDefaultProcessing'),
 ('TextPreprocessing', 'TextDigitAnonymizer'),
 ('CategoryEncoder', 'NumericalEncoder'),
 ('MissingValueImputer', 'NumImputer'),
 ('DimensionReduction', 'TruncatedSVDWrapper'),
 ('DimensionReduction', 'PCAWrapper'),
 ('TextDimensionReduction', 'TruncatedSVDWrapper'),
 ('DimensionReduction', 'KMeansTransformer'),
 ('Scaling', 'CdfScaler'),
 ('Model', 'KMeansWrapper'),
 ('Model', 'AgglomerativeClusteringWrapper'),
 ('Model', 'DBSCANWrapper')]

Manual clustering pipeline¶

[16]:

from aikit.pipeline import GraphPipeline

[17]:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

[18]:

# Two-node graph: the scaler's output feeds the KMeans node.
# NOTE(review): KMeans is left unseeded (random_state=None), so cluster ids
# may come out permuted from one run to the next.
gpipeline = GraphPipeline(
    models=dict(scaler=StandardScaler(), kmeans=KMeans(n_clusters=3)),
    edges=[("scaler", "kmeans")],
)

gpipeline.fit(X)

[18]:

GraphPipeline(edges=[('scaler', 'kmeans')],
              models={'kmeans': KMeans(algorithm='auto', copy_x=True,
                                       init='k-means++', max_iter=300,
                                       n_clusters=3, n_init=10, n_jobs=None,
                                       precompute_distances='auto',
                                       random_state=None, tol=0.0001,
                                       verbose=0),
                      'scaler': StandardScaler(copy=True, with_mean=True,
                                               with_std=True)},
              no_concat_nodes=None, verbose=False)

[19]:

# Predictions of the fitted pipeline on the data it was fitted on.
labels = gpipeline.predict(X)

[20]:

# Show the predicted cluster ids.
labels

[20]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2], dtype=int32)

[27]:

from aikit.cross_validation import score_from_params_clustering

# Score the scaler + KMeans graph pipeline on X; there are no labels, hence y=None.
cv_result = score_from_params_clustering(
    gpipeline,
    X,
    y=None,
    scoring=["silhouette", 'calinski_harabaz'],
    verbose=1,
)
cv_result

[27]:

	test_silhouette	test_calinski_harabaz	fit_time	score_time
0	0.506153	505.957631	0.016606	0.002193

[28]:

# Same scoring call, this time on a bare KMeans with no scaling step in front.
cv_result = score_from_params_clustering(
    KMeans(n_clusters=3),
    X,
    y=None,
    scoring=["silhouette", 'calinski_harabaz'],
    verbose=1,
)
cv_result

[28]:

	test_silhouette	test_calinski_harabaz	fit_time	score_time
0	0.552819	561.627757	0.017693	0.002682

Auto-ML¶

[ ]:

# Job-level settings: the scoring is guessed from the AutoML configuration.
job_config = JobConfig()
job_config.guess_scoring(auto_ml_config=auto_ml_config)

# No baseline score is set for this run.
job_config.score_base_line = None

[ ]:

# Inspect the scoring held by the job config (presumably filled in by guess_scoring()).
job_config.scoring

[ ]:

# Folder handed to FolderDataPersister.
# The original placeholder (`base_folder = # INSERT PATH HERE`) was a
# SyntaxError; a fresh temporary directory keeps the notebook runnable as-is.
# Replace it with a path of your own if you want to keep what is written there
# or share the folder with another process.
import tempfile

base_folder = tempfile.mkdtemp(prefix="aikit_automl_iris_")
data_persister = FolderDataPersister(base_folder = base_folder)

[ ]:

# The reader goes through the persister; the guider is built on the reader and the job config.
result_reader = AutoMlResultReader(data_persister)
auto_ml_guider = AutoMlModelGuider(
    result_reader=result_reader,
    job_config=job_config,
    metric_transformation="default",
    avg_metric=True,
)

# The manager receives both configs, the guider and the persister.
job_controller = MlJobManager(
    auto_ml_config=auto_ml_config,
    job_config=job_config,
    auto_ml_guider=auto_ml_guider,
    data_persister=data_persister,
)

[ ]:

# The runner gets the data itself (no labels, no groups) plus the same configs
# and persister that the manager was given.
job_runner = MlJobRunner(
    dfX=X,
    y=None,
    groups=None,
    auto_ml_config=auto_ml_config,
    job_config=job_config,
    data_persister=data_persister,
)

[ ]:

def my_function(u):
    """Run one of the two workers, selected by ``u``.

    ``u == 0`` runs ``job_controller``; ``u == 1`` runs ``job_runner``.
    Any other value does nothing. Returns ``None``.
    """
    wants_controller = (u == 0)
    if wants_controller:
        job_controller.run()

    wants_runner = (u == 1)
    if wants_runner:
        job_runner.run()

Careful: this will start 2 daemon threads that won’t stop until you stop them¶

```python
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(2)
results = pool.map(my_function, [0, 1])
```