{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
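"## Transformers\n",
"\n",
"This notebook applies several `aikit` transformers (numerical encoder, target encoder, count vectorizer, truncated SVD and columns selector) to the Titanic dataset."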
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from aikit.datasets.datasets import DatasetEnum, load_dataset\n",
"\n",
"# Only the first two of the five values returned by load_dataset are used below.\n",
"(Xtrain, y_train, _, _, _) = load_dataset(DatasetEnum.titanic)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home_dest | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" McCarthy, Mr. Timothy J | \n",
" male | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" 17463 | \n",
" 51.8625 | \n",
" E46 | \n",
" S | \n",
" NaN | \n",
" 175.0 | \n",
" Dorchester, MA | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" male | \n",
" 64.0 | \n",
" 1 | \n",
" 4 | \n",
" 19950 | \n",
" 263.0000 | \n",
" C23 C25 C27 | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Winnipeg, MB | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" Sagesser, Mlle. Emma | \n",
" female | \n",
" 24.0 | \n",
" 0 | \n",
" 0 | \n",
" PC 17477 | \n",
" 69.3000 | \n",
" B35 | \n",
" C | \n",
" 9 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" male | \n",
" 2.0 | \n",
" 4 | \n",
" 1 | \n",
" 3101295 | \n",
" 39.6875 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" Maioni, Miss. Roberta | \n",
" female | \n",
" 16.0 | \n",
" 0 | \n",
" 0 | \n",
" 110152 | \n",
" 86.5000 | \n",
" B79 | \n",
" S | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" 3 | \n",
" Waelens, Mr. Achille | \n",
" male | \n",
" 22.0 | \n",
" 0 | \n",
" 0 | \n",
" 345767 | \n",
" 9.0000 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Antwerp, Belgium / Stanton, OH | \n",
"
\n",
" \n",
" 6 | \n",
" 3 | \n",
" Reed, Mr. James George | \n",
" male | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 362316 | \n",
" 7.2500 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 7 | \n",
" 1 | \n",
" Swift, Mrs. Frederick Joel (Margaret Welles Ba... | \n",
" female | \n",
" 48.0 | \n",
" 0 | \n",
" 0 | \n",
" 17466 | \n",
" 25.9292 | \n",
" D17 | \n",
" S | \n",
" 8 | \n",
" NaN | \n",
" Brooklyn, NY | \n",
"
\n",
" \n",
" 8 | \n",
" 1 | \n",
" Smith, Mrs. Lucien Philip (Mary Eloise Hughes) | \n",
" female | \n",
" 18.0 | \n",
" 1 | \n",
" 0 | \n",
" 13695 | \n",
" 60.0000 | \n",
" C31 | \n",
" S | \n",
" 6 | \n",
" NaN | \n",
" Huntington, WV | \n",
"
\n",
" \n",
" 9 | \n",
" 1 | \n",
" Rowe, Mr. Alfred G | \n",
" male | \n",
" 33.0 | \n",
" 0 | \n",
" 0 | \n",
" 113790 | \n",
" 26.5500 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" 109.0 | \n",
" London | \n",
"
\n",
" \n",
" 10 | \n",
" 3 | \n",
" Meo, Mr. Alfonzo | \n",
" male | \n",
" 55.5 | \n",
" 0 | \n",
" 0 | \n",
" A.5. 11206 | \n",
" 8.0500 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" 201.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 11 | \n",
" 3 | \n",
" Abbott, Mr. Rossmore Edward | \n",
" male | \n",
" 16.0 | \n",
" 1 | \n",
" 1 | \n",
" C.A. 2673 | \n",
" 20.2500 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" 190.0 | \n",
" East Providence, RI | \n",
"
\n",
" \n",
" 12 | \n",
" 3 | \n",
" Elias, Mr. Dibo | \n",
" male | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 2674 | \n",
" 7.2250 | \n",
" NaN | \n",
" C | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 13 | \n",
" 2 | \n",
" Reynaldo, Ms. Encarnacion | \n",
" female | \n",
" 28.0 | \n",
" 0 | \n",
" 0 | \n",
" 230434 | \n",
" 13.0000 | \n",
" NaN | \n",
" S | \n",
" 9 | \n",
" NaN | \n",
" Spain | \n",
"
\n",
" \n",
" 14 | \n",
" 3 | \n",
" Khalil, Mr. Betros | \n",
" male | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 2660 | \n",
" 14.4542 | \n",
" NaN | \n",
" C | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 15 | \n",
" 1 | \n",
" Daniels, Miss. Sarah | \n",
" female | \n",
" 33.0 | \n",
" 0 | \n",
" 0 | \n",
" 113781 | \n",
" 151.5500 | \n",
" NaN | \n",
" S | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 16 | \n",
" 3 | \n",
" Ford, Miss. Robina Maggie 'Ruby' | \n",
" female | \n",
" 9.0 | \n",
" 2 | \n",
" 2 | \n",
" W./C. 6608 | \n",
" 34.3750 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Rotherfield, Sussex, England Essex Co, MA | \n",
"
\n",
" \n",
" 17 | \n",
" 3 | \n",
" Thorneycroft, Mrs. Percival (Florence Kate White) | \n",
" female | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 376564 | \n",
" 16.1000 | \n",
" NaN | \n",
" S | \n",
" 10 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 18 | \n",
" 3 | \n",
" Lennon, Mr. Denis | \n",
" male | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 370371 | \n",
" 15.5000 | \n",
" NaN | \n",
" Q | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 19 | \n",
" 3 | \n",
" de Pelsmaeker, Mr. Alfons | \n",
" male | \n",
" 16.0 | \n",
" 0 | \n",
" 0 | \n",
" 345778 | \n",
" 9.5000 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass name sex age \\\n",
"0 1 McCarthy, Mr. Timothy J male 54.0 \n",
"1 1 Fortune, Mr. Mark male 64.0 \n",
"2 1 Sagesser, Mlle. Emma female 24.0 \n",
"3 3 Panula, Master. Urho Abraham male 2.0 \n",
"4 1 Maioni, Miss. Roberta female 16.0 \n",
"5 3 Waelens, Mr. Achille male 22.0 \n",
"6 3 Reed, Mr. James George male NaN \n",
"7 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 \n",
"8 1 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) female 18.0 \n",
"9 1 Rowe, Mr. Alfred G male 33.0 \n",
"10 3 Meo, Mr. Alfonzo male 55.5 \n",
"11 3 Abbott, Mr. Rossmore Edward male 16.0 \n",
"12 3 Elias, Mr. Dibo male NaN \n",
"13 2 Reynaldo, Ms. Encarnacion female 28.0 \n",
"14 3 Khalil, Mr. Betros male NaN \n",
"15 1 Daniels, Miss. Sarah female 33.0 \n",
"16 3 Ford, Miss. Robina Maggie 'Ruby' female 9.0 \n",
"17 3 Thorneycroft, Mrs. Percival (Florence Kate White) female NaN \n",
"18 3 Lennon, Mr. Denis male NaN \n",
"19 3 de Pelsmaeker, Mr. Alfons male 16.0 \n",
"\n",
" sibsp parch ticket fare cabin embarked boat body \\\n",
"0 0 0 17463 51.8625 E46 S NaN 175.0 \n",
"1 1 4 19950 263.0000 C23 C25 C27 S NaN NaN \n",
"2 0 0 PC 17477 69.3000 B35 C 9 NaN \n",
"3 4 1 3101295 39.6875 NaN S NaN NaN \n",
"4 0 0 110152 86.5000 B79 S 8 NaN \n",
"5 0 0 345767 9.0000 NaN S NaN NaN \n",
"6 0 0 362316 7.2500 NaN S NaN NaN \n",
"7 0 0 17466 25.9292 D17 S 8 NaN \n",
"8 1 0 13695 60.0000 C31 S 6 NaN \n",
"9 0 0 113790 26.5500 NaN S NaN 109.0 \n",
"10 0 0 A.5. 11206 8.0500 NaN S NaN 201.0 \n",
"11 1 1 C.A. 2673 20.2500 NaN S NaN 190.0 \n",
"12 0 0 2674 7.2250 NaN C NaN NaN \n",
"13 0 0 230434 13.0000 NaN S 9 NaN \n",
"14 1 0 2660 14.4542 NaN C NaN NaN \n",
"15 0 0 113781 151.5500 NaN S 8 NaN \n",
"16 2 2 W./C. 6608 34.3750 NaN S NaN NaN \n",
"17 1 0 376564 16.1000 NaN S 10 NaN \n",
"18 1 0 370371 15.5000 NaN Q NaN NaN \n",
"19 0 0 345778 9.5000 NaN S NaN NaN \n",
"\n",
" home_dest \n",
"0 Dorchester, MA \n",
"1 Winnipeg, MB \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"5 Antwerp, Belgium / Stanton, OH \n",
"6 NaN \n",
"7 Brooklyn, NY \n",
"8 Huntington, WV \n",
"9 London \n",
"10 NaN \n",
"11 East Providence, RI \n",
"12 NaN \n",
"13 Spain \n",
"14 NaN \n",
"15 NaN \n",
"16 Rotherfield, Sussex, England Essex Co, MA \n",
"17 NaN \n",
"18 NaN \n",
"19 NaN "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Xtrain.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'CAT': ['sex', 'embarked', 'cabin'],\n",
" 'NUM': ['pclass', 'age', 'sibsp', 'parch', 'fare'],\n",
" 'TEXT': ['name', 'ticket']}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column names grouped under the block label used to select them in later cells.\n",
"columns_block = dict(\n",
"    CAT=[\"sex\", \"embarked\", \"cabin\"],\n",
"    NUM=[\"pclass\", \"age\", \"sibsp\", \"parch\", \"fare\"],\n",
"    TEXT=[\"name\", \"ticket\"],\n",
")\n",
"columns_block"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Numerical Encoder"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from aikit.transformers import NumericalEncoder\n",
"\n",
"# Encode the CAT and NUM blocks together; the other columns still appear in the result\n",
"# (see the output of the next cell).\n",
"encoder = NumericalEncoder(\n",
"    columns_to_use=columns_block[\"CAT\"] + columns_block[\"NUM\"],\n",
")\n",
"Xtrain_cat = encoder.fit_transform(Xtrain)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" ticket | \n",
" boat | \n",
" body | \n",
" home_dest | \n",
" sex__male | \n",
" sex__female | \n",
" embarked__S | \n",
" embarked__C | \n",
" embarked__Q | \n",
" ... | \n",
" fare__7.925 | \n",
" fare__7.225 | \n",
" fare__7.25 | \n",
" fare__8.6625 | \n",
" fare__0.0 | \n",
" fare__69.55 | \n",
" fare__15.5 | \n",
" fare__7.8542 | \n",
" fare__21.0 | \n",
" fare____default__ | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" McCarthy, Mr. Timothy J | \n",
" 17463 | \n",
" NaN | \n",
" 175.0 | \n",
" Dorchester, MA | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" 19950 | \n",
" NaN | \n",
" NaN | \n",
" Winnipeg, MB | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" Sagesser, Mlle. Emma | \n",
" PC 17477 | \n",
" 9 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" 3101295 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" Maioni, Miss. Roberta | \n",
" 110152 | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 80 columns
\n",
"
"
],
"text/plain": [
" name ticket boat body home_dest \\\n",
"0 McCarthy, Mr. Timothy J 17463 NaN 175.0 Dorchester, MA \n",
"1 Fortune, Mr. Mark 19950 NaN NaN Winnipeg, MB \n",
"2 Sagesser, Mlle. Emma PC 17477 9 NaN NaN \n",
"3 Panula, Master. Urho Abraham 3101295 NaN NaN NaN \n",
"4 Maioni, Miss. Roberta 110152 8 NaN NaN \n",
"\n",
" sex__male sex__female embarked__S embarked__C embarked__Q ... \\\n",
"0 1 0 1 0 0 ... \n",
"1 1 0 1 0 0 ... \n",
"2 0 1 0 1 0 ... \n",
"3 1 0 1 0 0 ... \n",
"4 0 1 1 0 0 ... \n",
"\n",
" fare__7.925 fare__7.225 fare__7.25 fare__8.6625 fare__0.0 fare__69.55 \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 \n",
"\n",
" fare__15.5 fare__7.8542 fare__21.0 fare____default__ \n",
"0 0 0 0 1 \n",
"1 0 0 0 1 \n",
"2 0 0 0 1 \n",
"3 0 0 0 1 \n",
"4 0 0 0 1 \n",
"\n",
"[5 rows x 80 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Xtrain_cat.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['name',\n",
" 'ticket',\n",
" 'boat',\n",
" 'body',\n",
" 'home_dest',\n",
" 'sex__male',\n",
" 'sex__female',\n",
" 'embarked__S',\n",
" 'embarked__C',\n",
" 'embarked__Q',\n",
" 'cabin____null__',\n",
" 'cabin____default__',\n",
" 'pclass__3',\n",
" 'pclass__1',\n",
" 'pclass__2',\n",
" 'age____null__',\n",
" 'age__24.0',\n",
" 'age__18.0',\n",
" 'age__22.0',\n",
" 'age__21.0',\n",
" 'age__30.0',\n",
" 'age__36.0',\n",
" 'age__28.0',\n",
" 'age__19.0',\n",
" 'age__27.0',\n",
" 'age__25.0',\n",
" 'age__29.0',\n",
" 'age__23.0',\n",
" 'age__31.0',\n",
" 'age__26.0',\n",
" 'age__35.0',\n",
" 'age__32.0',\n",
" 'age__33.0',\n",
" 'age__39.0',\n",
" 'age__17.0',\n",
" 'age__42.0',\n",
" 'age__45.0',\n",
" 'age__16.0',\n",
" 'age__20.0',\n",
" 'age__50.0',\n",
" 'age__40.0',\n",
" 'age__34.0',\n",
" 'age__38.0',\n",
" 'age__1.0',\n",
" 'age__47.0',\n",
" 'age____default__',\n",
" 'sibsp__0',\n",
" 'sibsp__1',\n",
" 'sibsp__2',\n",
" 'sibsp__4',\n",
" 'sibsp__3',\n",
" 'sibsp__8',\n",
" 'sibsp__5',\n",
" 'parch__0',\n",
" 'parch__1',\n",
" 'parch__2',\n",
" 'parch__5',\n",
" 'parch__4',\n",
" 'parch__3',\n",
" 'parch__9',\n",
" 'parch__6',\n",
" 'fare__13.0',\n",
" 'fare__7.75',\n",
" 'fare__8.05',\n",
" 'fare__7.8958',\n",
" 'fare__26.0',\n",
" 'fare__10.5',\n",
" 'fare__7.775',\n",
" 'fare__7.2292',\n",
" 'fare__26.55',\n",
" 'fare__7.925',\n",
" 'fare__7.225',\n",
" 'fare__7.25',\n",
" 'fare__8.6625',\n",
" 'fare__0.0',\n",
" 'fare__69.55',\n",
" 'fare__15.5',\n",
" 'fare__7.8542',\n",
" 'fare__21.0',\n",
" 'fare____default__']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(Xtrain_cat.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Target Encoder"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" ticket | \n",
" boat | \n",
" body | \n",
" home_dest | \n",
" sex__target_1 | \n",
" embarked__target_1 | \n",
" cabin__target_1 | \n",
" pclass__target_1 | \n",
" age__target_1 | \n",
" sibsp__target_1 | \n",
" parch__target_1 | \n",
" fare__target_1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" McCarthy, Mr. Timothy J | \n",
" 17463 | \n",
" NaN | \n",
" 175.0 | \n",
" Dorchester, MA | \n",
" 0.184564 | \n",
" 0.341945 | \n",
" 0.195652 | \n",
" 0.612766 | \n",
" 0.547457 | \n",
" 0.353941 | \n",
" 0.328632 | \n",
" 0.185878 | \n",
"
\n",
" \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" 19950 | \n",
" NaN | \n",
" NaN | \n",
" Winnipeg, MB | \n",
" 0.184564 | \n",
" 0.341945 | \n",
" 0.597354 | \n",
" 0.612766 | \n",
" 0.453744 | \n",
" 0.524017 | \n",
" 0.276773 | \n",
" 0.597354 | \n",
"
\n",
" \n",
" 2 | \n",
" Sagesser, Mlle. Emma | \n",
" PC 17477 | \n",
" 9 | \n",
" NaN | \n",
" NaN | \n",
" 0.746398 | \n",
" 0.576720 | \n",
" 0.695652 | \n",
" 0.612766 | \n",
" 0.481185 | \n",
" 0.353941 | \n",
" 0.328632 | \n",
" 0.695652 | \n",
"
\n",
" \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" 3101295 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.184564 | \n",
" 0.341945 | \n",
" 0.314483 | \n",
" 0.264822 | \n",
" 0.463933 | \n",
" 0.267929 | \n",
" 0.653542 | \n",
" 0.166522 | \n",
"
\n",
" \n",
" 4 | \n",
" Maioni, Miss. Roberta | \n",
" 110152 | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
" 0.746398 | \n",
" 0.341945 | \n",
" 0.391304 | \n",
" 0.612766 | \n",
" 0.427970 | \n",
" 0.353941 | \n",
" 0.328632 | \n",
" 0.710857 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name ticket boat body home_dest \\\n",
"0 McCarthy, Mr. Timothy J 17463 NaN 175.0 Dorchester, MA \n",
"1 Fortune, Mr. Mark 19950 NaN NaN Winnipeg, MB \n",
"2 Sagesser, Mlle. Emma PC 17477 9 NaN NaN \n",
"3 Panula, Master. Urho Abraham 3101295 NaN NaN NaN \n",
"4 Maioni, Miss. Roberta 110152 8 NaN NaN \n",
"\n",
" sex__target_1 embarked__target_1 cabin__target_1 pclass__target_1 \\\n",
"0 0.184564 0.341945 0.195652 0.612766 \n",
"1 0.184564 0.341945 0.597354 0.612766 \n",
"2 0.746398 0.576720 0.695652 0.612766 \n",
"3 0.184564 0.341945 0.314483 0.264822 \n",
"4 0.746398 0.341945 0.391304 0.612766 \n",
"\n",
" age__target_1 sibsp__target_1 parch__target_1 fare__target_1 \n",
"0 0.547457 0.353941 0.328632 0.185878 \n",
"1 0.453744 0.524017 0.276773 0.597354 \n",
"2 0.481185 0.353941 0.328632 0.695652 \n",
"3 0.463933 0.267929 0.653542 0.166522 \n",
"4 0.427970 0.353941 0.328632 0.710857 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aikit.transformers import TargetEncoderClassifier\n",
"\n",
"# Dedicated names are used here so that `encoder` and `Xtrain_cat` from the\n",
"# Numerical Encoder section are not overwritten when this cell runs.\n",
"target_encoder = TargetEncoderClassifier(\n",
"    columns_to_use=columns_block[\"CAT\"] + columns_block[\"NUM\"],\n",
")\n",
"# Unlike the previous encoder, this one is fitted with the target (y_train) as well.\n",
"Xtrain_target = target_encoder.fit_transform(Xtrain, y_train)\n",
"\n",
"Xtrain_target.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<1048x62 sparse matrix of type ''\n",
"\twith 21936 stored elements in COOrdinate format>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aikit.transformers import CountVectorizerWrapper\n",
"\n",
"# Character-level counts over the TEXT block; the result is displayed as a sparse matrix.\n",
"encoder = CountVectorizerWrapper(\n",
"    analyzer=\"char\",\n",
"    columns_to_use=columns_block[\"TEXT\"],\n",
")\n",
"Xtrain_enc = encoder.fit_transform(Xtrain)\n",
"\n",
"Xtrain_enc"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['name__BAG__ ',\n",
" \"name__BAG__'\",\n",
" 'name__BAG__(',\n",
" 'name__BAG__)',\n",
" 'name__BAG__,',\n",
" 'name__BAG__-',\n",
" 'name__BAG__.',\n",
" 'name__BAG__/',\n",
" 'name__BAG__a',\n",
" 'name__BAG__b',\n",
" 'name__BAG__c',\n",
" 'name__BAG__d',\n",
" 'name__BAG__e',\n",
" 'name__BAG__f',\n",
" 'name__BAG__g',\n",
" 'name__BAG__h',\n",
" 'name__BAG__i',\n",
" 'name__BAG__j',\n",
" 'name__BAG__k',\n",
" 'name__BAG__l',\n",
" 'name__BAG__m',\n",
" 'name__BAG__n',\n",
" 'name__BAG__o',\n",
" 'name__BAG__p',\n",
" 'name__BAG__q',\n",
" 'name__BAG__r',\n",
" 'name__BAG__s',\n",
" 'name__BAG__t',\n",
" 'name__BAG__u',\n",
" 'name__BAG__v',\n",
" 'name__BAG__w',\n",
" 'name__BAG__x',\n",
" 'name__BAG__y',\n",
" 'name__BAG__z',\n",
" 'ticket__BAG__ ',\n",
" 'ticket__BAG__.',\n",
" 'ticket__BAG__/',\n",
" 'ticket__BAG__0',\n",
" 'ticket__BAG__1',\n",
" 'ticket__BAG__2',\n",
" 'ticket__BAG__3',\n",
" 'ticket__BAG__4',\n",
" 'ticket__BAG__5',\n",
" 'ticket__BAG__6',\n",
" 'ticket__BAG__7',\n",
" 'ticket__BAG__8',\n",
" 'ticket__BAG__9',\n",
" 'ticket__BAG__a',\n",
" 'ticket__BAG__c',\n",
" 'ticket__BAG__e',\n",
" 'ticket__BAG__f',\n",
" 'ticket__BAG__h',\n",
" 'ticket__BAG__i',\n",
" 'ticket__BAG__l',\n",
" 'ticket__BAG__n',\n",
" 'ticket__BAG__o',\n",
" 'ticket__BAG__p',\n",
" 'ticket__BAG__q',\n",
" 'ticket__BAG__r',\n",
" 'ticket__BAG__s',\n",
" 'ticket__BAG__t',\n",
" 'ticket__BAG__w']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder.get_feature_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Truncated SVD"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SVD__0 | \n",
" SVD__1 | \n",
" SVD__2 | \n",
" SVD__3 | \n",
" SVD__4 | \n",
" SVD__5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.087193 | \n",
" -0.825855 | \n",
" -1.777827 | \n",
" 1.079631 | \n",
" 1.131004 | \n",
" -1.943098 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.329638 | \n",
" -1.584227 | \n",
" -1.592165 | \n",
" 0.356413 | \n",
" 0.598414 | \n",
" -0.024379 | \n",
"
\n",
" \n",
" 2 | \n",
" 5.899164 | \n",
" -0.197573 | \n",
" 2.482450 | \n",
" -0.058293 | \n",
" -1.766613 | \n",
" -0.709567 | \n",
"
\n",
" \n",
" 3 | \n",
" 7.419224 | \n",
" 0.221492 | \n",
" -2.534401 | \n",
" 2.470868 | \n",
" -1.714951 | \n",
" 0.337166 | \n",
"
\n",
" \n",
" 4 | \n",
" 5.771280 | \n",
" 1.272598 | \n",
" -0.836011 | \n",
" 0.126549 | \n",
" 1.036792 | \n",
" -0.689974 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1043 | \n",
" 7.751914 | \n",
" -0.199622 | \n",
" 0.934052 | \n",
" -0.541200 | \n",
" -1.632366 | \n",
" -0.380562 | \n",
"
\n",
" \n",
" 1044 | \n",
" 7.063821 | \n",
" -1.730135 | \n",
" -1.130117 | \n",
" -2.086049 | \n",
" 0.066611 | \n",
" -0.866250 | \n",
"
\n",
" \n",
" 1045 | \n",
" 5.595593 | \n",
" 1.029825 | \n",
" 1.534740 | \n",
" 1.149602 | \n",
" 1.487406 | \n",
" 1.612092 | \n",
"
\n",
" \n",
" 1046 | \n",
" 4.625180 | \n",
" -1.261425 | \n",
" -0.873728 | \n",
" -0.566604 | \n",
" 1.032036 | \n",
" 0.202452 | \n",
"
\n",
" \n",
" 1047 | \n",
" 5.020200 | \n",
" 1.633941 | \n",
" -2.131337 | \n",
" -0.567731 | \n",
" 0.226184 | \n",
" -1.410670 | \n",
"
\n",
" \n",
"
\n",
"
1048 rows × 6 columns
\n",
"
"
],
"text/plain": [
" SVD__0 SVD__1 SVD__2 SVD__3 SVD__4 SVD__5\n",
"0 5.087193 -0.825855 -1.777827 1.079631 1.131004 -1.943098\n",
"1 4.329638 -1.584227 -1.592165 0.356413 0.598414 -0.024379\n",
"2 5.899164 -0.197573 2.482450 -0.058293 -1.766613 -0.709567\n",
"3 7.419224 0.221492 -2.534401 2.470868 -1.714951 0.337166\n",
"4 5.771280 1.272598 -0.836011 0.126549 1.036792 -0.689974\n",
"... ... ... ... ... ... ...\n",
"1043 7.751914 -0.199622 0.934052 -0.541200 -1.632366 -0.380562\n",
"1044 7.063821 -1.730135 -1.130117 -2.086049 0.066611 -0.866250\n",
"1045 5.595593 1.029825 1.534740 1.149602 1.487406 1.612092\n",
"1046 4.625180 -1.261425 -0.873728 -0.566604 1.032036 0.202452\n",
"1047 5.020200 1.633941 -2.131337 -0.567731 0.226184 -1.410670\n",
"\n",
"[1048 rows x 6 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aikit.transformers import TruncatedSVDWrapper\n",
"\n",
"# A float n_components below 1 is presumably read as a fraction of the input columns\n",
"# (62 columns in, 6 components out in the saved output) -- confirm against the aikit docs.\n",
"# random_state is fixed so that re-running the notebook reproduces the same components.\n",
"svd = TruncatedSVDWrapper(n_components=0.1, random_state=123)\n",
"\n",
"xx_train_small_svd = svd.fit_transform(Xtrain_enc)\n",
"xx_train_small_svd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Column selector"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" ticket | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" McCarthy, Mr. Timothy J | \n",
" 17463 | \n",
"
\n",
" \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" 19950 | \n",
"
\n",
" \n",
" 2 | \n",
" Sagesser, Mlle. Emma | \n",
" PC 17477 | \n",
"
\n",
" \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" 3101295 | \n",
"
\n",
" \n",
" 4 | \n",
" Maioni, Miss. Roberta | \n",
" 110152 | \n",
"
\n",
" \n",
" 5 | \n",
" Waelens, Mr. Achille | \n",
" 345767 | \n",
"
\n",
" \n",
" 6 | \n",
" Reed, Mr. James George | \n",
" 362316 | \n",
"
\n",
" \n",
" 7 | \n",
" Swift, Mrs. Frederick Joel (Margaret Welles Ba... | \n",
" 17466 | \n",
"
\n",
" \n",
" 8 | \n",
" Smith, Mrs. Lucien Philip (Mary Eloise Hughes) | \n",
" 13695 | \n",
"
\n",
" \n",
" 9 | \n",
" Rowe, Mr. Alfred G | \n",
" 113790 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name ticket\n",
"0 McCarthy, Mr. Timothy J 17463\n",
"1 Fortune, Mr. Mark 19950\n",
"2 Sagesser, Mlle. Emma PC 17477\n",
"3 Panula, Master. Urho Abraham 3101295\n",
"4 Maioni, Miss. Roberta 110152\n",
"5 Waelens, Mr. Achille 345767\n",
"6 Reed, Mr. James George 362316\n",
"7 Swift, Mrs. Frederick Joel (Margaret Welles Ba... 17466\n",
"8 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) 13695\n",
"9 Rowe, Mr. Alfred G 113790"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aikit.transformers import ColumnsSelector\n",
"\n",
"# Keep only the columns listed in the TEXT block.\n",
"selector = ColumnsSelector(\n",
"    columns_to_use=columns_block[\"TEXT\"],\n",
")\n",
"Xtrain_subset = selector.fit_transform(Xtrain)\n",
"Xtrain_subset.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}