Transformers

[1]:
import warnings
# Silence every warning for the rest of the session so the rendered outputs stay
# uncluttered. NOTE(review): this also hides deprecation warnings from the
# libraries used below; consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')
[2]:
from aikit.datasets.datasets import load_dataset,DatasetEnum
# Load the Titanic dataset. The loader returns five values; only the first two
# (training features and training target) are kept, the remaining three are discarded.
# NOTE(review): presumably the discarded values are the test split and dataset
# metadata -- confirm against the aikit documentation.
Xtrain, y_train, _ ,_ , _ = load_dataset(DatasetEnum.titanic)
[3]:
# Preview the first 20 training rows to see the raw column types and missing values.
Xtrain.head(20)
[3]:
pclass name sex age sibsp parch ticket fare cabin embarked boat body home_dest
0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S NaN 175.0 Dorchester, MA
1 1 Fortune, Mr. Mark male 64.0 1 4 19950 263.0000 C23 C25 C27 S NaN NaN Winnipeg, MB
2 1 Sagesser, Mlle. Emma female 24.0 0 0 PC 17477 69.3000 B35 C 9 NaN NaN
3 3 Panula, Master. Urho Abraham male 2.0 4 1 3101295 39.6875 NaN S NaN NaN NaN
4 1 Maioni, Miss. Roberta female 16.0 0 0 110152 86.5000 B79 S 8 NaN NaN
5 3 Waelens, Mr. Achille male 22.0 0 0 345767 9.0000 NaN S NaN NaN Antwerp, Belgium / Stanton, OH
6 3 Reed, Mr. James George male NaN 0 0 362316 7.2500 NaN S NaN NaN NaN
7 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S 8 NaN Brooklyn, NY
8 1 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) female 18.0 1 0 13695 60.0000 C31 S 6 NaN Huntington, WV
9 1 Rowe, Mr. Alfred G male 33.0 0 0 113790 26.5500 NaN S NaN 109.0 London
10 3 Meo, Mr. Alfonzo male 55.5 0 0 A.5. 11206 8.0500 NaN S NaN 201.0 NaN
11 3 Abbott, Mr. Rossmore Edward male 16.0 1 1 C.A. 2673 20.2500 NaN S NaN 190.0 East Providence, RI
12 3 Elias, Mr. Dibo male NaN 0 0 2674 7.2250 NaN C NaN NaN NaN
13 2 Reynaldo, Ms. Encarnacion female 28.0 0 0 230434 13.0000 NaN S 9 NaN Spain
14 3 Khalil, Mr. Betros male NaN 1 0 2660 14.4542 NaN C NaN NaN NaN
15 1 Daniels, Miss. Sarah female 33.0 0 0 113781 151.5500 NaN S 8 NaN NaN
16 3 Ford, Miss. Robina Maggie 'Ruby' female 9.0 2 2 W./C. 6608 34.3750 NaN S NaN NaN Rotherfield, Sussex, England Essex Co, MA
17 3 Thorneycroft, Mrs. Percival (Florence Kate White) female NaN 1 0 376564 16.1000 NaN S 10 NaN NaN
18 3 Lennon, Mr. Denis male NaN 1 0 370371 15.5000 NaN Q NaN NaN NaN
19 3 de Pelsmaeker, Mr. Alfons male 16.0 0 0 345778 9.5000 NaN S NaN NaN NaN
[4]:
# Group the raw columns by the kind of encoding they need:
#   CAT  -> categorical values, NUM -> numeric values, TEXT -> free text.
columns_block = dict(
    CAT=["sex", "embarked", "cabin"],
    NUM=["pclass", "age", "sibsp", "parch", "fare"],
    TEXT=["name", "ticket"],
)
columns_block
[4]:
{'CAT': ['sex', 'embarked', 'cabin'],
 'NUM': ['pclass', 'age', 'sibsp', 'parch', 'fare'],
 'TEXT': ['name', 'ticket']}

Numerical Encoder

[5]:
from aikit.transformers import NumericalEncoder

# Encode the categorical and numerical column groups into indicator columns
# named `<column>__<value>`; columns outside the selection (name, ticket, ...)
# are carried through unchanged, as the preview of the result shows.
encoder = NumericalEncoder(
    columns_to_use=[*columns_block["CAT"], *columns_block["NUM"]],
)
Xtrain_cat = encoder.fit_transform(Xtrain)
[6]:
# Preview the encoded frame: untouched columns first, then the generated indicator columns.
Xtrain_cat.head()
[6]:
name ticket boat body home_dest sex__male sex__female embarked__S embarked__C embarked__Q ... fare__7.925 fare__7.225 fare__7.25 fare__8.6625 fare__0.0 fare__69.55 fare__15.5 fare__7.8542 fare__21.0 fare____default__
0 McCarthy, Mr. Timothy J 17463 NaN 175.0 Dorchester, MA 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 Fortune, Mr. Mark 19950 NaN NaN Winnipeg, MB 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
2 Sagesser, Mlle. Emma PC 17477 9 NaN NaN 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
3 Panula, Master. Urho Abraham 3101295 NaN NaN NaN 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 Maioni, Miss. Roberta 110152 8 NaN NaN 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 80 columns

[7]:
# Full list of output column names, including the `__null__` and `__default__` buckets.
Xtrain_cat.columns.tolist()
[7]:
['name',
 'ticket',
 'boat',
 'body',
 'home_dest',
 'sex__male',
 'sex__female',
 'embarked__S',
 'embarked__C',
 'embarked__Q',
 'cabin____null__',
 'cabin____default__',
 'pclass__3',
 'pclass__1',
 'pclass__2',
 'age____null__',
 'age__24.0',
 'age__18.0',
 'age__22.0',
 'age__21.0',
 'age__30.0',
 'age__36.0',
 'age__28.0',
 'age__19.0',
 'age__27.0',
 'age__25.0',
 'age__29.0',
 'age__23.0',
 'age__31.0',
 'age__26.0',
 'age__35.0',
 'age__32.0',
 'age__33.0',
 'age__39.0',
 'age__17.0',
 'age__42.0',
 'age__45.0',
 'age__16.0',
 'age__20.0',
 'age__50.0',
 'age__40.0',
 'age__34.0',
 'age__38.0',
 'age__1.0',
 'age__47.0',
 'age____default__',
 'sibsp__0',
 'sibsp__1',
 'sibsp__2',
 'sibsp__4',
 'sibsp__3',
 'sibsp__8',
 'sibsp__5',
 'parch__0',
 'parch__1',
 'parch__2',
 'parch__5',
 'parch__4',
 'parch__3',
 'parch__9',
 'parch__6',
 'fare__13.0',
 'fare__7.75',
 'fare__8.05',
 'fare__7.8958',
 'fare__26.0',
 'fare__10.5',
 'fare__7.775',
 'fare__7.2292',
 'fare__26.55',
 'fare__7.925',
 'fare__7.225',
 'fare__7.25',
 'fare__8.6625',
 'fare__0.0',
 'fare__69.55',
 'fare__15.5',
 'fare__7.8542',
 'fare__21.0',
 'fare____default__']

Target Encoder

[8]:
from aikit.transformers import TargetEncoderClassifier

# Replace each selected column by a single target-derived column named
# `<column>__target_1`; the target `y_train` is required for fitting.
# NOTE: this rebinds `encoder` and `Xtrain_cat` from the Numerical Encoder section.
encoder = TargetEncoderClassifier(
    columns_to_use=[*columns_block["CAT"], *columns_block["NUM"]],
)
Xtrain_cat = encoder.fit_transform(Xtrain, y_train)

Xtrain_cat.head()
[8]:
name ticket boat body home_dest sex__target_1 embarked__target_1 cabin__target_1 pclass__target_1 age__target_1 sibsp__target_1 parch__target_1 fare__target_1
0 McCarthy, Mr. Timothy J 17463 NaN 175.0 Dorchester, MA 0.184564 0.341945 0.195652 0.612766 0.547457 0.353941 0.328632 0.185878
1 Fortune, Mr. Mark 19950 NaN NaN Winnipeg, MB 0.184564 0.341945 0.597354 0.612766 0.453744 0.524017 0.276773 0.597354
2 Sagesser, Mlle. Emma PC 17477 9 NaN NaN 0.746398 0.576720 0.695652 0.612766 0.481185 0.353941 0.328632 0.695652
3 Panula, Master. Urho Abraham 3101295 NaN NaN NaN 0.184564 0.341945 0.314483 0.264822 0.463933 0.267929 0.653542 0.166522
4 Maioni, Miss. Roberta 110152 8 NaN NaN 0.746398 0.341945 0.391304 0.612766 0.427970 0.353941 0.328632 0.710857

CountVectorizer

[9]:
from aikit.transformers import CountVectorizerWrapper

# Character-level token counts over the free-text columns (name, ticket).
# NOTE: this rebinds `encoder`; the feature-name cell that follows relies on it.
encoder = CountVectorizerWrapper(analyzer="char",columns_to_use=columns_block["TEXT"])

# The result is a sparse matrix rather than a DataFrame (see the repr below).
Xtrain_enc = encoder.fit_transform(Xtrain)
Xtrain_enc
[9]:
<1048x62 sparse matrix of type '<class 'numpy.int32'>'
        with 21936 stored elements in COOrdinate format>
[10]:
# Names of the generated features, in the form `<column>__BAG__<character>`.
encoder.get_feature_names()
[10]:
['name__BAG__ ',
 "name__BAG__'",
 'name__BAG__(',
 'name__BAG__)',
 'name__BAG__,',
 'name__BAG__-',
 'name__BAG__.',
 'name__BAG__/',
 'name__BAG__a',
 'name__BAG__b',
 'name__BAG__c',
 'name__BAG__d',
 'name__BAG__e',
 'name__BAG__f',
 'name__BAG__g',
 'name__BAG__h',
 'name__BAG__i',
 'name__BAG__j',
 'name__BAG__k',
 'name__BAG__l',
 'name__BAG__m',
 'name__BAG__n',
 'name__BAG__o',
 'name__BAG__p',
 'name__BAG__q',
 'name__BAG__r',
 'name__BAG__s',
 'name__BAG__t',
 'name__BAG__u',
 'name__BAG__v',
 'name__BAG__w',
 'name__BAG__x',
 'name__BAG__y',
 'name__BAG__z',
 'ticket__BAG__ ',
 'ticket__BAG__.',
 'ticket__BAG__/',
 'ticket__BAG__0',
 'ticket__BAG__1',
 'ticket__BAG__2',
 'ticket__BAG__3',
 'ticket__BAG__4',
 'ticket__BAG__5',
 'ticket__BAG__6',
 'ticket__BAG__7',
 'ticket__BAG__8',
 'ticket__BAG__9',
 'ticket__BAG__a',
 'ticket__BAG__c',
 'ticket__BAG__e',
 'ticket__BAG__f',
 'ticket__BAG__h',
 'ticket__BAG__i',
 'ticket__BAG__l',
 'ticket__BAG__n',
 'ticket__BAG__o',
 'ticket__BAG__p',
 'ticket__BAG__q',
 'ticket__BAG__r',
 'ticket__BAG__s',
 'ticket__BAG__t',
 'ticket__BAG__w']

Truncated SVD

[11]:
from aikit.transformers import TruncatedSVDWrapper
# Reduce the 62 sparse count features to a handful of dense components.
# NOTE(review): a float `n_components` appears to be read as a fraction of the
# input features (0.1 of 62 -> 6 output columns) -- confirm in the aikit docs.
# NOTE(review): no seed is passed; if the underlying solver is randomized the
# values may differ between runs -- check whether the wrapper accepts `random_state`.
svd = TruncatedSVDWrapper(n_components=0.1)

xx_train_small_svd = svd.fit_transform(Xtrain_enc)
xx_train_small_svd
[11]:
SVD__0 SVD__1 SVD__2 SVD__3 SVD__4 SVD__5
0 5.087193 -0.825855 -1.777827 1.079631 1.131004 -1.943098
1 4.329638 -1.584227 -1.592165 0.356413 0.598414 -0.024379
2 5.899164 -0.197573 2.482450 -0.058293 -1.766613 -0.709567
3 7.419224 0.221492 -2.534401 2.470868 -1.714951 0.337166
4 5.771280 1.272598 -0.836011 0.126549 1.036792 -0.689974
... ... ... ... ... ... ...
1043 7.751914 -0.199622 0.934052 -0.541200 -1.632366 -0.380562
1044 7.063821 -1.730135 -1.130117 -2.086049 0.066611 -0.866250
1045 5.595593 1.029825 1.534740 1.149602 1.487406 1.612092
1046 4.625180 -1.261425 -0.873728 -0.566604 1.032036 0.202452
1047 5.020200 1.633941 -2.131337 -0.567731 0.226184 -1.410670

1048 rows × 6 columns

Column selector

[12]:
from aikit.transformers import ColumnsSelector
# Keep only the free-text columns; every other column is dropped from the result.
selector = ColumnsSelector(columns_to_use=columns_block["TEXT"])
Xtrain_subset = selector.fit_transform(Xtrain)
# Preview the first 10 rows of the selected subset.
Xtrain_subset.head(10)
[12]:
name ticket
0 McCarthy, Mr. Timothy J 17463
1 Fortune, Mr. Mark 19950
2 Sagesser, Mlle. Emma PC 17477
3 Panula, Master. Urho Abraham 3101295
4 Maioni, Miss. Roberta 110152
5 Waelens, Mr. Achille 345767
6 Reed, Mr. James George 362316
7 Swift, Mrs. Frederick Joel (Margaret Welles Ba... 17466
8 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) 13695
9 Rowe, Mr. Alfred G 113790
[ ]: