Transformers
[1]:
import warnings
warnings.filterwarnings('ignore')
[2]:
from aikit.datasets.datasets import DatasetEnum, load_dataset

# load_dataset hands back five values for the titanic dataset; only the first two
# (the feature table and the target used by the cells below) are kept, the other
# three are discarded into `_`.
(Xtrain, y_train, _, _, _) = load_dataset(DatasetEnum.titanic)
[3]:
# Preview the first 20 rows of the training features.
Xtrain.head(20)
[3]:
pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home_dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S | NaN | 175.0 | Dorchester, MA |
1 | 1 | Fortune, Mr. Mark | male | 64.0 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S | NaN | NaN | Winnipeg, MB |
2 | 1 | Sagesser, Mlle. Emma | female | 24.0 | 0 | 0 | PC 17477 | 69.3000 | B35 | C | 9 | NaN | NaN |
3 | 3 | Panula, Master. Urho Abraham | male | 2.0 | 4 | 1 | 3101295 | 39.6875 | NaN | S | NaN | NaN | NaN |
4 | 1 | Maioni, Miss. Roberta | female | 16.0 | 0 | 0 | 110152 | 86.5000 | B79 | S | 8 | NaN | NaN |
5 | 3 | Waelens, Mr. Achille | male | 22.0 | 0 | 0 | 345767 | 9.0000 | NaN | S | NaN | NaN | Antwerp, Belgium / Stanton, OH |
6 | 3 | Reed, Mr. James George | male | NaN | 0 | 0 | 362316 | 7.2500 | NaN | S | NaN | NaN | NaN |
7 | 1 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | female | 48.0 | 0 | 0 | 17466 | 25.9292 | D17 | S | 8 | NaN | Brooklyn, NY |
8 | 1 | Smith, Mrs. Lucien Philip (Mary Eloise Hughes) | female | 18.0 | 1 | 0 | 13695 | 60.0000 | C31 | S | 6 | NaN | Huntington, WV |
9 | 1 | Rowe, Mr. Alfred G | male | 33.0 | 0 | 0 | 113790 | 26.5500 | NaN | S | NaN | 109.0 | London |
10 | 3 | Meo, Mr. Alfonzo | male | 55.5 | 0 | 0 | A.5. 11206 | 8.0500 | NaN | S | NaN | 201.0 | NaN |
11 | 3 | Abbott, Mr. Rossmore Edward | male | 16.0 | 1 | 1 | C.A. 2673 | 20.2500 | NaN | S | NaN | 190.0 | East Providence, RI |
12 | 3 | Elias, Mr. Dibo | male | NaN | 0 | 0 | 2674 | 7.2250 | NaN | C | NaN | NaN | NaN |
13 | 2 | Reynaldo, Ms. Encarnacion | female | 28.0 | 0 | 0 | 230434 | 13.0000 | NaN | S | 9 | NaN | Spain |
14 | 3 | Khalil, Mr. Betros | male | NaN | 1 | 0 | 2660 | 14.4542 | NaN | C | NaN | NaN | NaN |
15 | 1 | Daniels, Miss. Sarah | female | 33.0 | 0 | 0 | 113781 | 151.5500 | NaN | S | 8 | NaN | NaN |
16 | 3 | Ford, Miss. Robina Maggie 'Ruby' | female | 9.0 | 2 | 2 | W./C. 6608 | 34.3750 | NaN | S | NaN | NaN | Rotherfield, Sussex, England Essex Co, MA |
17 | 3 | Thorneycroft, Mrs. Percival (Florence Kate White) | female | NaN | 1 | 0 | 376564 | 16.1000 | NaN | S | 10 | NaN | NaN |
18 | 3 | Lennon, Mr. Denis | male | NaN | 1 | 0 | 370371 | 15.5000 | NaN | Q | NaN | NaN | NaN |
19 | 3 | de Pelsmaeker, Mr. Alfons | male | 16.0 | 0 | 0 | 345778 | 9.5000 | NaN | S | NaN | NaN | NaN |
[4]:
# Group the feature columns by the kind of encoding applied to them in the cells below.
columns_block = dict(
    CAT=["sex", "embarked", "cabin"],
    NUM=["pclass", "age", "sibsp", "parch", "fare"],
    TEXT=["name", "ticket"],
)
columns_block
[4]:
{'CAT': ['sex', 'embarked', 'cabin'],
'NUM': ['pclass', 'age', 'sibsp', 'parch', 'fare'],
'TEXT': ['name', 'ticket']}
Numerical Encoder
[5]:
from aikit.transformers import NumericalEncoder

# Encode the categorical and numerical column groups in one pass. Columns outside
# those groups (name, ticket, boat, body, home_dest) come out unchanged, as the
# column listing further down shows.
encoder = NumericalEncoder(
    columns_to_use=[*columns_block["CAT"], *columns_block["NUM"]]
)
Xtrain_cat = encoder.fit_transform(Xtrain)
[6]:
# Show the first rows of the encoded frame.
Xtrain_cat.head()
[6]:
name | ticket | boat | body | home_dest | sex__male | sex__female | embarked__S | embarked__C | embarked__Q | ... | fare__7.925 | fare__7.225 | fare__7.25 | fare__8.6625 | fare__0.0 | fare__69.55 | fare__15.5 | fare__7.8542 | fare__21.0 | fare____default__ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | McCarthy, Mr. Timothy J | 17463 | NaN | 175.0 | Dorchester, MA | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | Fortune, Mr. Mark | 19950 | NaN | NaN | Winnipeg, MB | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | Sagesser, Mlle. Emma | PC 17477 | 9 | NaN | NaN | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | Panula, Master. Urho Abraham | 3101295 | NaN | NaN | NaN | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | Maioni, Miss. Roberta | 110152 | 8 | NaN | NaN | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 80 columns
[7]:
# Enumerate every output column name: untouched columns first, then the
# generated `<column>__<value>` columns.
Xtrain_cat.columns.tolist()
[7]:
['name',
'ticket',
'boat',
'body',
'home_dest',
'sex__male',
'sex__female',
'embarked__S',
'embarked__C',
'embarked__Q',
'cabin____null__',
'cabin____default__',
'pclass__3',
'pclass__1',
'pclass__2',
'age____null__',
'age__24.0',
'age__18.0',
'age__22.0',
'age__21.0',
'age__30.0',
'age__36.0',
'age__28.0',
'age__19.0',
'age__27.0',
'age__25.0',
'age__29.0',
'age__23.0',
'age__31.0',
'age__26.0',
'age__35.0',
'age__32.0',
'age__33.0',
'age__39.0',
'age__17.0',
'age__42.0',
'age__45.0',
'age__16.0',
'age__20.0',
'age__50.0',
'age__40.0',
'age__34.0',
'age__38.0',
'age__1.0',
'age__47.0',
'age____default__',
'sibsp__0',
'sibsp__1',
'sibsp__2',
'sibsp__4',
'sibsp__3',
'sibsp__8',
'sibsp__5',
'parch__0',
'parch__1',
'parch__2',
'parch__5',
'parch__4',
'parch__3',
'parch__9',
'parch__6',
'fare__13.0',
'fare__7.75',
'fare__8.05',
'fare__7.8958',
'fare__26.0',
'fare__10.5',
'fare__7.775',
'fare__7.2292',
'fare__26.55',
'fare__7.925',
'fare__7.225',
'fare__7.25',
'fare__8.6625',
'fare__0.0',
'fare__69.55',
'fare__15.5',
'fare__7.8542',
'fare__21.0',
'fare____default__']
Target Encoder
[8]:
from aikit.transformers import TargetEncoderClassifier

# Target-encode the same categorical and numerical column groups; each encoded
# column ends up as a single `<column>__target_1` column in the output below.
# NOTE: this rebinds both `encoder` and `Xtrain_cat` from the Numerical Encoder section.
encoder = TargetEncoderClassifier(
    columns_to_use=[*columns_block["CAT"], *columns_block["NUM"]]
)
Xtrain_cat = encoder.fit_transform(Xtrain, y_train)
Xtrain_cat.head()
[8]:
name | ticket | boat | body | home_dest | sex__target_1 | embarked__target_1 | cabin__target_1 | pclass__target_1 | age__target_1 | sibsp__target_1 | parch__target_1 | fare__target_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | McCarthy, Mr. Timothy J | 17463 | NaN | 175.0 | Dorchester, MA | 0.184564 | 0.341945 | 0.195652 | 0.612766 | 0.547457 | 0.353941 | 0.328632 | 0.185878 |
1 | Fortune, Mr. Mark | 19950 | NaN | NaN | Winnipeg, MB | 0.184564 | 0.341945 | 0.597354 | 0.612766 | 0.453744 | 0.524017 | 0.276773 | 0.597354 |
2 | Sagesser, Mlle. Emma | PC 17477 | 9 | NaN | NaN | 0.746398 | 0.576720 | 0.695652 | 0.612766 | 0.481185 | 0.353941 | 0.328632 | 0.695652 |
3 | Panula, Master. Urho Abraham | 3101295 | NaN | NaN | NaN | 0.184564 | 0.341945 | 0.314483 | 0.264822 | 0.463933 | 0.267929 | 0.653542 | 0.166522 |
4 | Maioni, Miss. Roberta | 110152 | 8 | NaN | NaN | 0.746398 | 0.341945 | 0.391304 | 0.612766 | 0.427970 | 0.353941 | 0.328632 | 0.710857 |
CountVectorizer
[9]:
from aikit.transformers import CountVectorizerWrapper

# Character-level counts over the text columns; the result displayed below is a
# sparse matrix rather than a DataFrame.
# NOTE: this rebinds `encoder`, which the next cell queries for feature names.
encoder = CountVectorizerWrapper(
    columns_to_use=columns_block["TEXT"],
    analyzer="char",
)
Xtrain_enc = encoder.fit_transform(Xtrain)
Xtrain_enc
[9]:
<1048x62 sparse matrix of type '<class 'numpy.int32'>'
with 21936 stored elements in COOrdinate format>
[10]:
# List the names of the generated features; in the output below each one has the
# form `<column>__BAG__<character>`.
encoder.get_feature_names()
[10]:
['name__BAG__ ',
"name__BAG__'",
'name__BAG__(',
'name__BAG__)',
'name__BAG__,',
'name__BAG__-',
'name__BAG__.',
'name__BAG__/',
'name__BAG__a',
'name__BAG__b',
'name__BAG__c',
'name__BAG__d',
'name__BAG__e',
'name__BAG__f',
'name__BAG__g',
'name__BAG__h',
'name__BAG__i',
'name__BAG__j',
'name__BAG__k',
'name__BAG__l',
'name__BAG__m',
'name__BAG__n',
'name__BAG__o',
'name__BAG__p',
'name__BAG__q',
'name__BAG__r',
'name__BAG__s',
'name__BAG__t',
'name__BAG__u',
'name__BAG__v',
'name__BAG__w',
'name__BAG__x',
'name__BAG__y',
'name__BAG__z',
'ticket__BAG__ ',
'ticket__BAG__.',
'ticket__BAG__/',
'ticket__BAG__0',
'ticket__BAG__1',
'ticket__BAG__2',
'ticket__BAG__3',
'ticket__BAG__4',
'ticket__BAG__5',
'ticket__BAG__6',
'ticket__BAG__7',
'ticket__BAG__8',
'ticket__BAG__9',
'ticket__BAG__a',
'ticket__BAG__c',
'ticket__BAG__e',
'ticket__BAG__f',
'ticket__BAG__h',
'ticket__BAG__i',
'ticket__BAG__l',
'ticket__BAG__n',
'ticket__BAG__o',
'ticket__BAG__p',
'ticket__BAG__q',
'ticket__BAG__r',
'ticket__BAG__s',
'ticket__BAG__t',
'ticket__BAG__w']
Truncated SVD
[11]:
from aikit.transformers import TruncatedSVDWrapper
# Reduce the sparse character-count matrix with a truncated SVD. In the run shown
# below, n_components=0.1 yields 6 SVD columns from the 62 input features.
# NOTE(review): 0.1 is presumably read as a fraction of the input columns — confirm against the aikit docs.
svd = TruncatedSVDWrapper(n_components=0.1)
xx_train_small_svd = svd.fit_transform(Xtrain_enc)
# Display the reduced matrix (rendered as a DataFrame with SVD__<i> columns).
xx_train_small_svd
[11]:
SVD__0 | SVD__1 | SVD__2 | SVD__3 | SVD__4 | SVD__5 | |
---|---|---|---|---|---|---|
0 | 5.087193 | -0.825855 | -1.777827 | 1.079631 | 1.131004 | -1.943098 |
1 | 4.329638 | -1.584227 | -1.592165 | 0.356413 | 0.598414 | -0.024379 |
2 | 5.899164 | -0.197573 | 2.482450 | -0.058293 | -1.766613 | -0.709567 |
3 | 7.419224 | 0.221492 | -2.534401 | 2.470868 | -1.714951 | 0.337166 |
4 | 5.771280 | 1.272598 | -0.836011 | 0.126549 | 1.036792 | -0.689974 |
... | ... | ... | ... | ... | ... | ... |
1043 | 7.751914 | -0.199622 | 0.934052 | -0.541200 | -1.632366 | -0.380562 |
1044 | 7.063821 | -1.730135 | -1.130117 | -2.086049 | 0.066611 | -0.866250 |
1045 | 5.595593 | 1.029825 | 1.534740 | 1.149602 | 1.487406 | 1.612092 |
1046 | 4.625180 | -1.261425 | -0.873728 | -0.566604 | 1.032036 | 0.202452 |
1047 | 5.020200 | 1.633941 | -2.131337 | -0.567731 | 0.226184 | -1.410670 |
1048 rows × 6 columns
Column selector
[12]:
from aikit.transformers import ColumnsSelector

# Keep only the text columns of the training frame and preview the first ten rows.
selector = ColumnsSelector(
    columns_to_use=columns_block["TEXT"],
)
Xtrain_subset = selector.fit_transform(Xtrain)
Xtrain_subset.head(n=10)
[12]:
name | ticket | |
---|---|---|
0 | McCarthy, Mr. Timothy J | 17463 |
1 | Fortune, Mr. Mark | 19950 |
2 | Sagesser, Mlle. Emma | PC 17477 |
3 | Panula, Master. Urho Abraham | 3101295 |
4 | Maioni, Miss. Roberta | 110152 |
5 | Waelens, Mr. Achille | 345767 |
6 | Reed, Mr. James George | 362316 |
7 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | 17466 |
8 | Smith, Mrs. Lucien Philip (Mary Eloise Hughes) | 13695 |
9 | Rowe, Mr. Alfred G | 113790 |
[ ]: