[1]:
# Enable IPython's autoreload extension in mode 2, so that edits made to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
[26]:
import warnings
# NOTE: 'ignore' with no category/module argument hides *all* warnings for the
# rest of the session, not only the gensim one this line was added for.
warnings.filterwarnings('ignore')  # originally added to hide a gensim warning

Auto clustering

This notebook will explain the auto-clustering capabilities of aikit.

It walks through the individual components involved. If you just want to run the auto-ML, you should use the automl launcher instead.

Custom random search

[3]:
import pandas as pd
from sklearn.datasets import load_iris
[4]:
# Load the iris dataset bundled with scikit-learn.
iris = load_iris()
[5]:
# Feature matrix and class labels of the iris dataset.
# The labels are kept for reference only: the clustering calls in this
# notebook are made with y=None.
X, y = iris.data, iris.target
[6]:
# Wrap the feature matrix in a DataFrame so the columns carry the iris feature names.
X = pd.DataFrame(X, columns=iris.feature_names)
# Preview the first rows.
X.head()
[6]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
[7]:
# Store the targets as a single-column DataFrame named 'label'.
y = pd.DataFrame(y, columns=['label'])
[9]:
from aikit.ml_machine import AutoMlConfig, JobConfig,  MlJobManager, MlJobRunner, AutoMlResultReader
from aikit.ml_machine import FolderDataPersister, SavingType, AutoMlModelGuider

AutoML configuration object

This object will contain all the relevant information about the problem at hand:

* its type: REGRESSION, CLASSIFICATION or CLUSTERING
* the information about the columns in the data
* the steps that are needed in the processing pipeline (see explanation after)
* the models that are to be tested
* …

By default the configuration object will guess everything, but every setting can be changed if needed.

If y is set to None, it will guess that it is a clustering problem.

[10]:
from aikit.ml_machine.ml_machine_registration import MODEL_REGISTER
[11]:
# y=None: no target is supplied, so the problem is guessed to be a clustering one.
auto_ml_config = AutoMlConfig(dfX=X, y=None, name = "iris")
# Let the configuration object guess its settings from the data
# (the cells below inspect what was guessed).
auto_ml_config.guess_everything()
[11]:
<aikit.ml_machine.ml_machine.AutoMlConfig object at 0x12eb44630>
type of problem : CLUSTERING
[12]:
# Problem type that was guessed ('CLUSTERING' here, since y=None was passed).
auto_ml_config.type_of_problem
[12]:
'CLUSTERING'
[13]:
# Per-column information, transposed so that each data column is shown as one row.
pd.DataFrame(auto_ml_config.columns_informations).T
[13]:
HasMissing ToKeep TypeOfVariable
sepal length (cm) False True NUM
sepal width (cm) False True NUM
petal length (cm) False True NUM
petal width (cm) False True NUM
[14]:
# Pipeline steps that were guessed, each flagged as optional or not.
auto_ml_config.needed_steps
[14]:
[{'step': 'Scaling', 'optional': True},
 {'step': 'DimensionReduction', 'optional': True},
 {'step': 'FeatureExtraction', 'optional': True},
 {'step': 'FeatureSelection', 'optional': True},
 {'step': 'Model', 'optional': False}]
[15]:
# (step, model name) pairs that are kept for the search.
auto_ml_config.models_to_keep
[15]:
[('TextEncoder', 'CountVectorizerWrapper'),
 ('TextEncoder', 'Word2VecVectorizer'),
 ('TextEncoder', 'Char2VecVectorizer'),
 ('TextPreprocessing', 'TextNltkProcessing'),
 ('TextPreprocessing', 'TextDefaultProcessing'),
 ('TextPreprocessing', 'TextDigitAnonymizer'),
 ('CategoryEncoder', 'NumericalEncoder'),
 ('MissingValueImputer', 'NumImputer'),
 ('DimensionReduction', 'TruncatedSVDWrapper'),
 ('DimensionReduction', 'PCAWrapper'),
 ('TextDimensionReduction', 'TruncatedSVDWrapper'),
 ('DimensionReduction', 'KMeansTransformer'),
 ('Scaling', 'CdfScaler'),
 ('Model', 'KMeansWrapper'),
 ('Model', 'AgglomerativeClusteringWrapper'),
 ('Model', 'DBSCANWrapper')]

Manual clustering pipeline

[16]:
from aikit.pipeline import GraphPipeline
[17]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
[18]:
# Two-node graph: standardise the features, then cluster them with KMeans.
# NOTE(review): KMeans is created without a random_state, so the cluster ids
# may differ from one run to the next.
gpipeline = GraphPipeline(
    models={
        "scaler": StandardScaler(),
        "kmeans": KMeans(n_clusters=3),
    },
    edges=[("scaler", "kmeans")],
)

# Fit on the features only; the fitted pipeline is displayed as the cell output.
gpipeline.fit(X)
[18]:
GraphPipeline(edges=[('scaler', 'kmeans')],
              models={'kmeans': KMeans(algorithm='auto', copy_x=True,
                                       init='k-means++', max_iter=300,
                                       n_clusters=3, n_init=10, n_jobs=None,
                                       precompute_distances='auto',
                                       random_state=None, tol=0.0001,
                                       verbose=0),
                      'scaler': StandardScaler(copy=True, with_mean=True,
                                               with_std=True)},
              no_concat_nodes=None, verbose=False)
[19]:
# Cluster id assigned to each row of X by the fitted pipeline.
labels = gpipeline.predict(X)
[20]:
# Display the predicted cluster ids.
labels
[20]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2], dtype=int32)
[27]:
from aikit.cross_validation import score_from_params_clustering

# Score the scaler + KMeans pipeline with two clustering metrics.
# y=None because there is no target in a clustering problem.
cv_result = score_from_params_clustering(
    gpipeline,
    X,
    y=None,
    scoring=["silhouette", 'calinski_harabaz'],
    verbose=1,
)
cv_result
[27]:
test_silhouette test_calinski_harabaz fit_time score_time
0 0.506153 505.957631 0.016606 0.002193
[28]:
# Same scoring call on a bare KMeans (no scaling step), for comparison.
# Note that this overwrites the previous `cv_result`.
cv_result = score_from_params_clustering(
    KMeans(n_clusters=3),
    X,
    y=None,
    scoring=["silhouette", 'calinski_harabaz'],
    verbose=1,
)
cv_result
[28]:
test_silhouette test_calinski_harabaz fit_time score_time
0 0.552819 561.627757 0.017693 0.002682

Auto-ML

[ ]:
job_config = JobConfig()
# Let the job configuration choose its scoring from the auto-ml configuration.
job_config.guess_scoring(auto_ml_config = auto_ml_config)

# NOTE(review): presumably means "no baseline score to compare against" — confirm against the aikit documentation.
job_config.score_base_line = None
[ ]:
# Show the scoring that was guessed for this job.
job_config.scoring
[ ]:
import os
import tempfile

# Folder in which the auto-ml jobs and their results are persisted.
# The original cell (`base_folder = # INSERT PATH HERE`) was a SyntaxError, so
# a runnable default under the system temp directory is used instead.
# Replace it with your own path if you want to keep results between sessions.
base_folder = os.path.join(tempfile.gettempdir(), "aikit_auto_clustering_iris")
os.makedirs(base_folder, exist_ok=True)  # make sure the folder exists before it is used

data_persister = FolderDataPersister(base_folder = base_folder)
[ ]:
# Reads results back from the shared persistence folder.
result_reader = AutoMlResultReader(data_persister)

# Guider built on top of the result reader and the job configuration.
auto_ml_guider = AutoMlModelGuider(
    result_reader=result_reader,
    job_config=job_config,
    metric_transformation="default",
    avg_metric=True,
)

# Controller side of the auto-ml: shares the same data persister as the runner.
job_controller = MlJobManager(
    auto_ml_config=auto_ml_config,
    job_config=job_config,
    auto_ml_guider=auto_ml_guider,
    data_persister=data_persister,
)
[ ]:
# Runner side of the auto-ml. y and groups are None: clustering has no target.
job_runner = MlJobRunner(
    dfX=X,
    y=None,
    groups=None,
    auto_ml_config=auto_ml_config,
    job_config=job_config,
    data_persister=data_persister,
)
[ ]:
def my_function(u):
    """Start one of the two auto-ml loops, selected by ``u``.

    ``u == 0`` calls ``job_controller.run()`` and ``u == 1`` calls
    ``job_runner.run()``. Any other value does nothing. Returns ``None``.
    """
    # Lambdas keep the lookup of the global objects lazy: nothing is touched
    # unless the matching selector is actually hit.
    launchers = (
        (0, lambda: job_controller.run()),
        (1, lambda: job_runner.run()),
    )
    for selector, launch in launchers:
        if u == selector:
            launch()

Careful: this will start 2 daemon threads that won’t stop until you stop them.

```python
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(2)
results = pool.map(my_function, [0, 1])
```