[1]:
# Enable IPython's autoreload extension so that edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2
[26]:
import warnings
warnings.filterwarnings('ignore') # to remove gensim warning
Auto clustering¶
This notebook explains the auto-clustering capabilities of aikit.
It walks through each of the steps involved. If you just want to run it, you should use the automl launcher instead.
Custom random search
[3]:
import pandas as pd
from sklearn.datasets import load_iris
[4]:
# Load scikit-learn's iris dataset bundle (provides .data, .target and .feature_names).
iris = load_iris()
[5]:
# Pull the feature matrix and the class labels out of the dataset bundle.
X, y = iris.data, iris.target
[6]:
# Wrap the features in a DataFrame so that every column carries its feature name,
# then display the first rows.
X = pd.DataFrame(data=X, columns=iris.feature_names)
X.head()
[6]:
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
[7]:
# Keep the targets as a one-column DataFrame named "label".
y = pd.DataFrame(data=y, columns=["label"])
[9]:
from aikit.ml_machine import AutoMlConfig, JobConfig, MlJobManager, MlJobRunner, AutoMlResultReader
from aikit.ml_machine import FolderDataPersister, SavingType, AutoMlModelGuider
AutoML configuration object¶
This object contains all the relevant information about the problem at hand:
* its type: REGRESSION, CLASSIFICATION or CLUSTERING
* the information about the columns in the data
* the steps that are needed in the processing pipeline (see explanation after)
* the models that are to be tested
* …
By default the configuration object guesses everything, but any of it can be changed if needed.
If `y` is set to `None`, it will guess that it is a clustering problem.
[10]:
from aikit.ml_machine.ml_machine_registration import MODEL_REGISTER
[11]:
# No target is supplied (y=None), so the configuration is guessed for a clustering problem.
auto_ml_config = AutoMlConfig(
    dfX=X,
    y=None,
    name="iris",
)
auto_ml_config.guess_everything()
[11]:
<aikit.ml_machine.ml_machine.AutoMlConfig object at 0x12eb44630>
type of problem : CLUSTERING
[12]:
auto_ml_config.type_of_problem
[12]:
'CLUSTERING'
[13]:
pd.DataFrame(auto_ml_config.columns_informations).T
[13]:
HasMissing | ToKeep | TypeOfVariable | |
---|---|---|---|
sepal length (cm) | False | True | NUM |
sepal width (cm) | False | True | NUM |
petal length (cm) | False | True | NUM |
petal width (cm) | False | True | NUM |
[14]:
auto_ml_config.needed_steps
[14]:
[{'step': 'Scaling', 'optional': True},
{'step': 'DimensionReduction', 'optional': True},
{'step': 'FeatureExtraction', 'optional': True},
{'step': 'FeatureSelection', 'optional': True},
{'step': 'Model', 'optional': False}]
[15]:
auto_ml_config.models_to_keep
[15]:
[('TextEncoder', 'CountVectorizerWrapper'),
('TextEncoder', 'Word2VecVectorizer'),
('TextEncoder', 'Char2VecVectorizer'),
('TextPreprocessing', 'TextNltkProcessing'),
('TextPreprocessing', 'TextDefaultProcessing'),
('TextPreprocessing', 'TextDigitAnonymizer'),
('CategoryEncoder', 'NumericalEncoder'),
('MissingValueImputer', 'NumImputer'),
('DimensionReduction', 'TruncatedSVDWrapper'),
('DimensionReduction', 'PCAWrapper'),
('TextDimensionReduction', 'TruncatedSVDWrapper'),
('DimensionReduction', 'KMeansTransformer'),
('Scaling', 'CdfScaler'),
('Model', 'KMeansWrapper'),
('Model', 'AgglomerativeClusteringWrapper'),
('Model', 'DBSCANWrapper')]
Manual clustering pipeline¶
[16]:
from aikit.pipeline import GraphPipeline
[17]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
[18]:
# Two-node graph: standardise the features, then cluster them with k-means.
# random_state is pinned so that the cluster ids and the scores computed from
# this pipeline are reproducible across a fresh "Restart & Run All".
gpipeline = GraphPipeline(
    models={
        "scaler": StandardScaler(),
        "kmeans": KMeans(n_clusters=3, random_state=0),
    },
    edges=[("scaler", "kmeans")],
)
gpipeline.fit(X)
[18]:
GraphPipeline(edges=[('scaler', 'kmeans')],
models={'kmeans': KMeans(algorithm='auto', copy_x=True,
init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None,
precompute_distances='auto',
random_state=None, tol=0.0001,
verbose=0),
'scaler': StandardScaler(copy=True, with_mean=True,
with_std=True)},
no_concat_nodes=None, verbose=False)
[19]:
# Cluster id assigned by the fitted pipeline to each row of X.
labels = gpipeline.predict(X)
[20]:
labels
[20]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,
2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2], dtype=int32)
[27]:
from aikit.cross_validation import score_from_params_clustering

# Score the scaler + k-means pipeline with two clustering metrics; no target is passed.
cv_result = score_from_params_clustering(
    gpipeline,
    X,
    y=None,
    scoring=["silhouette", "calinski_harabaz"],
    verbose=1,
)
cv_result
[27]:
test_silhouette | test_calinski_harabaz | fit_time | score_time | |
---|---|---|---|---|
0 | 0.506153 | 505.957631 | 0.016606 | 0.002193 |
[28]:
# Same scoring on a bare k-means (no scaling step), for comparison with the pipeline.
# random_state is pinned so that the reported scores are reproducible between runs.
cv_result = score_from_params_clustering(
    KMeans(n_clusters=3, random_state=0),
    X,
    y=None,
    scoring=["silhouette", "calinski_harabaz"],
    verbose=1,
)
cv_result
[28]:
test_silhouette | test_calinski_harabaz | fit_time | score_time | |
---|---|---|---|---|
0 | 0.552819 | 561.627757 | 0.017693 | 0.002682 |
Auto-ML¶
[ ]:
# Build a default job configuration and let it pick its scoring from the
# auto-ML configuration guessed earlier.
job_config = JobConfig()
job_config.guess_scoring(auto_ml_config = auto_ml_config)
# NOTE(review): presumably removes any baseline-score requirement — confirm against aikit's JobConfig.
job_config.score_base_line = None
[ ]:
job_config.scoring
[ ]:
import tempfile

# Folder handed to the FolderDataPersister that the job manager, the job runner
# and the result reader all share.
# The original placeholder (`base_folder = # INSERT PATH HERE`) was a syntax
# error; a fresh temporary directory keeps the cell runnable as-is.
# Replace it with a path of your own to keep the results somewhere permanent.
base_folder = tempfile.mkdtemp(prefix="aikit_auto_clustering_")
data_persister = FolderDataPersister(base_folder=base_folder)
[ ]:
# Controller side: a result reader over the shared persister, a model guider
# built on that reader, and the job manager that is given both the guider and
# the persister.
result_reader = AutoMlResultReader(data_persister)

auto_ml_guider = AutoMlModelGuider(
    result_reader=result_reader,
    job_config=job_config,
    metric_transformation="default",
    avg_metric=True,
)

job_controller = MlJobManager(
    auto_ml_config=auto_ml_config,
    job_config=job_config,
    auto_ml_guider=auto_ml_guider,
    data_persister=data_persister,
)
[ ]:
# Runner side: receives the features (no target, no groups) together with the
# same configuration objects and persister as the controller.
job_runner = MlJobRunner(
    dfX=X,
    y=None,
    groups=None,
    auto_ml_config=auto_ml_config,
    job_config=job_config,
    data_persister=data_persister,
)
[ ]:
def my_function(u):
    """Run the job controller or the job runner, depending on ``u``.

    ``u == 0`` calls ``job_controller.run()`` and ``u == 1`` calls
    ``job_runner.run()``. Any other value does nothing. Nothing is returned.
    """
    if u == 0:
        job_controller.run()
    elif u == 1:
        job_runner.run()
Careful: this will start 2 daemon threads that won’t stop until you stop them¶
```python
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(2)
results = pool.map(my_function, [0,1])
```