{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Transformers" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "# Suppress every Python warning for the rest of the notebook.\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from aikit.datasets.datasets import DatasetEnum, load_dataset\n", "\n", "# load_dataset returns five values; only the first two are used in this notebook.\n", "Xtrain, y_train, _, _, _ = load_dataset(DatasetEnum.titanic)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclassnamesexagesibspparchticketfarecabinembarkedboatbodyhome_dest
01McCarthy, Mr. Timothy Jmale54.0001746351.8625E46SNaN175.0Dorchester, MA
11Fortune, Mr. Markmale64.01419950263.0000C23 C25 C27SNaNNaNWinnipeg, MB
21Sagesser, Mlle. Emmafemale24.000PC 1747769.3000B35C9NaNNaN
33Panula, Master. Urho Abrahammale2.041310129539.6875NaNSNaNNaNNaN
41Maioni, Miss. Robertafemale16.00011015286.5000B79S8NaNNaN
53Waelens, Mr. Achillemale22.0003457679.0000NaNSNaNNaNAntwerp, Belgium / Stanton, OH
63Reed, Mr. James GeorgemaleNaN003623167.2500NaNSNaNNaNNaN
71Swift, Mrs. Frederick Joel (Margaret Welles Ba...female48.0001746625.9292D17S8NaNBrooklyn, NY
81Smith, Mrs. Lucien Philip (Mary Eloise Hughes)female18.0101369560.0000C31S6NaNHuntington, WV
91Rowe, Mr. Alfred Gmale33.00011379026.5500NaNSNaN109.0London
103Meo, Mr. Alfonzomale55.500A.5. 112068.0500NaNSNaN201.0NaN
113Abbott, Mr. Rossmore Edwardmale16.011C.A. 267320.2500NaNSNaN190.0East Providence, RI
123Elias, Mr. DibomaleNaN0026747.2250NaNCNaNNaNNaN
132Reynaldo, Ms. Encarnacionfemale28.00023043413.0000NaNS9NaNSpain
143Khalil, Mr. BetrosmaleNaN10266014.4542NaNCNaNNaNNaN
151Daniels, Miss. Sarahfemale33.000113781151.5500NaNS8NaNNaN
163Ford, Miss. Robina Maggie 'Ruby'female9.022W./C. 660834.3750NaNSNaNNaNRotherfield, Sussex, England Essex Co, MA
173Thorneycroft, Mrs. Percival (Florence Kate White)femaleNaN1037656416.1000NaNS10NaNNaN
183Lennon, Mr. DenismaleNaN1037037115.5000NaNQNaNNaNNaN
193de Pelsmaeker, Mr. Alfonsmale16.0003457789.5000NaNSNaNNaNNaN
\n", "
" ], "text/plain": [ " pclass name sex age \\\n", "0 1 McCarthy, Mr. Timothy J male 54.0 \n", "1 1 Fortune, Mr. Mark male 64.0 \n", "2 1 Sagesser, Mlle. Emma female 24.0 \n", "3 3 Panula, Master. Urho Abraham male 2.0 \n", "4 1 Maioni, Miss. Roberta female 16.0 \n", "5 3 Waelens, Mr. Achille male 22.0 \n", "6 3 Reed, Mr. James George male NaN \n", "7 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 \n", "8 1 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) female 18.0 \n", "9 1 Rowe, Mr. Alfred G male 33.0 \n", "10 3 Meo, Mr. Alfonzo male 55.5 \n", "11 3 Abbott, Mr. Rossmore Edward male 16.0 \n", "12 3 Elias, Mr. Dibo male NaN \n", "13 2 Reynaldo, Ms. Encarnacion female 28.0 \n", "14 3 Khalil, Mr. Betros male NaN \n", "15 1 Daniels, Miss. Sarah female 33.0 \n", "16 3 Ford, Miss. Robina Maggie 'Ruby' female 9.0 \n", "17 3 Thorneycroft, Mrs. Percival (Florence Kate White) female NaN \n", "18 3 Lennon, Mr. Denis male NaN \n", "19 3 de Pelsmaeker, Mr. Alfons male 16.0 \n", "\n", " sibsp parch ticket fare cabin embarked boat body \\\n", "0 0 0 17463 51.8625 E46 S NaN 175.0 \n", "1 1 4 19950 263.0000 C23 C25 C27 S NaN NaN \n", "2 0 0 PC 17477 69.3000 B35 C 9 NaN \n", "3 4 1 3101295 39.6875 NaN S NaN NaN \n", "4 0 0 110152 86.5000 B79 S 8 NaN \n", "5 0 0 345767 9.0000 NaN S NaN NaN \n", "6 0 0 362316 7.2500 NaN S NaN NaN \n", "7 0 0 17466 25.9292 D17 S 8 NaN \n", "8 1 0 13695 60.0000 C31 S 6 NaN \n", "9 0 0 113790 26.5500 NaN S NaN 109.0 \n", "10 0 0 A.5. 11206 8.0500 NaN S NaN 201.0 \n", "11 1 1 C.A. 2673 20.2500 NaN S NaN 190.0 \n", "12 0 0 2674 7.2250 NaN C NaN NaN \n", "13 0 0 230434 13.0000 NaN S 9 NaN \n", "14 1 0 2660 14.4542 NaN C NaN NaN \n", "15 0 0 113781 151.5500 NaN S 8 NaN \n", "16 2 2 W./C. 
6608 34.3750 NaN S NaN NaN \n", "17 1 0 376564 16.1000 NaN S 10 NaN \n", "18 1 0 370371 15.5000 NaN Q NaN NaN \n", "19 0 0 345778 9.5000 NaN S NaN NaN \n", "\n", " home_dest \n", "0 Dorchester, MA \n", "1 Winnipeg, MB \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 Antwerp, Belgium / Stanton, OH \n", "6 NaN \n", "7 Brooklyn, NY \n", "8 Huntington, WV \n", "9 London \n", "10 NaN \n", "11 East Providence, RI \n", "12 NaN \n", "13 Spain \n", "14 NaN \n", "15 NaN \n", "16 Rotherfield, Sussex, England Essex Co, MA \n", "17 NaN \n", "18 NaN \n", "19 NaN " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Xtrain.head(20)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'CAT': ['sex', 'embarked', 'cabin'],\n", " 'NUM': ['pclass', 'age', 'sibsp', 'parch', 'fare'],\n", " 'TEXT': ['name', 'ticket']}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Column names grouped by the kind of encoding applied to them in the sections below.\n", "columns_block = {\n", "    \"CAT\": [\"sex\", \"embarked\", \"cabin\"],\n", "    \"NUM\": [\"pclass\", \"age\", \"sibsp\", \"parch\", \"fare\"],\n", "    \"TEXT\": [\"name\", \"ticket\"],\n", "}\n", "columns_block" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Numerical Encoder" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from aikit.transformers import NumericalEncoder\n", "\n", "# The categorical and the numerical columns are encoded together; the columns that\n", "# are not listed (name, ticket, ...) are still present in the result shown below.\n", "encoded_columns = columns_block[\"CAT\"] + columns_block[\"NUM\"]\n", "encoder = NumericalEncoder(columns_to_use=encoded_columns)\n", "Xtrain_cat = encoder.fit_transform(Xtrain)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameticketboatbodyhome_destsex__malesex__femaleembarked__Sembarked__Cembarked__Q...fare__7.925fare__7.225fare__7.25fare__8.6625fare__0.0fare__69.55fare__15.5fare__7.8542fare__21.0fare____default__
0McCarthy, Mr. Timothy J17463NaN175.0Dorchester, MA10100...0000000001
1Fortune, Mr. Mark19950NaNNaNWinnipeg, MB10100...0000000001
2Sagesser, Mlle. EmmaPC 174779NaNNaN01010...0000000001
3Panula, Master. Urho Abraham3101295NaNNaNNaN10100...0000000001
4Maioni, Miss. Roberta1101528NaNNaN01100...0000000001
\n", "

5 rows × 80 columns

\n", "
" ], "text/plain": [ " name ticket boat body home_dest \\\n", "0 McCarthy, Mr. Timothy J 17463 NaN 175.0 Dorchester, MA \n", "1 Fortune, Mr. Mark 19950 NaN NaN Winnipeg, MB \n", "2 Sagesser, Mlle. Emma PC 17477 9 NaN NaN \n", "3 Panula, Master. Urho Abraham 3101295 NaN NaN NaN \n", "4 Maioni, Miss. Roberta 110152 8 NaN NaN \n", "\n", " sex__male sex__female embarked__S embarked__C embarked__Q ... \\\n", "0 1 0 1 0 0 ... \n", "1 1 0 1 0 0 ... \n", "2 0 1 0 1 0 ... \n", "3 1 0 1 0 0 ... \n", "4 0 1 1 0 0 ... \n", "\n", " fare__7.925 fare__7.225 fare__7.25 fare__8.6625 fare__0.0 fare__69.55 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", "\n", " fare__15.5 fare__7.8542 fare__21.0 fare____default__ \n", "0 0 0 0 1 \n", "1 0 0 0 1 \n", "2 0 0 0 1 \n", "3 0 0 0 1 \n", "4 0 0 0 1 \n", "\n", "[5 rows x 80 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first five rows of the encoded frame.\n", "Xtrain_cat.head(n=5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['name',\n", " 'ticket',\n", " 'boat',\n", " 'body',\n", " 'home_dest',\n", " 'sex__male',\n", " 'sex__female',\n", " 'embarked__S',\n", " 'embarked__C',\n", " 'embarked__Q',\n", " 'cabin____null__',\n", " 'cabin____default__',\n", " 'pclass__3',\n", " 'pclass__1',\n", " 'pclass__2',\n", " 'age____null__',\n", " 'age__24.0',\n", " 'age__18.0',\n", " 'age__22.0',\n", " 'age__21.0',\n", " 'age__30.0',\n", " 'age__36.0',\n", " 'age__28.0',\n", " 'age__19.0',\n", " 'age__27.0',\n", " 'age__25.0',\n", " 'age__29.0',\n", " 'age__23.0',\n", " 'age__31.0',\n", " 'age__26.0',\n", " 'age__35.0',\n", " 'age__32.0',\n", " 'age__33.0',\n", " 'age__39.0',\n", " 'age__17.0',\n", " 'age__42.0',\n", " 'age__45.0',\n", " 'age__16.0',\n", " 'age__20.0',\n", " 'age__50.0',\n", " 'age__40.0',\n", " 'age__34.0',\n", " 'age__38.0',\n", " 'age__1.0',\n", " 'age__47.0',\n", " 'age____default__',\n", " 
'sibsp__0',\n", " 'sibsp__1',\n", " 'sibsp__2',\n", " 'sibsp__4',\n", " 'sibsp__3',\n", " 'sibsp__8',\n", " 'sibsp__5',\n", " 'parch__0',\n", " 'parch__1',\n", " 'parch__2',\n", " 'parch__5',\n", " 'parch__4',\n", " 'parch__3',\n", " 'parch__9',\n", " 'parch__6',\n", " 'fare__13.0',\n", " 'fare__7.75',\n", " 'fare__8.05',\n", " 'fare__7.8958',\n", " 'fare__26.0',\n", " 'fare__10.5',\n", " 'fare__7.775',\n", " 'fare__7.2292',\n", " 'fare__26.55',\n", " 'fare__7.925',\n", " 'fare__7.225',\n", " 'fare__7.25',\n", " 'fare__8.6625',\n", " 'fare__0.0',\n", " 'fare__69.55',\n", " 'fare__15.5',\n", " 'fare__7.8542',\n", " 'fare__21.0',\n", " 'fare____default__']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Every column name of the encoded frame, as a plain Python list.\n", "Xtrain_cat.columns.tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Target Encoder" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameticketboatbodyhome_destsex__target_1embarked__target_1cabin__target_1pclass__target_1age__target_1sibsp__target_1parch__target_1fare__target_1
0McCarthy, Mr. Timothy J17463NaN175.0Dorchester, MA0.1845640.3419450.1956520.6127660.5474570.3539410.3286320.185878
1Fortune, Mr. Mark19950NaNNaNWinnipeg, MB0.1845640.3419450.5973540.6127660.4537440.5240170.2767730.597354
2Sagesser, Mlle. EmmaPC 174779NaNNaN0.7463980.5767200.6956520.6127660.4811850.3539410.3286320.695652
3Panula, Master. Urho Abraham3101295NaNNaNNaN0.1845640.3419450.3144830.2648220.4639330.2679290.6535420.166522
4Maioni, Miss. Roberta1101528NaNNaN0.7463980.3419450.3913040.6127660.4279700.3539410.3286320.710857
\n", "
" ], "text/plain": [ " name ticket boat body home_dest \\\n", "0 McCarthy, Mr. Timothy J 17463 NaN 175.0 Dorchester, MA \n", "1 Fortune, Mr. Mark 19950 NaN NaN Winnipeg, MB \n", "2 Sagesser, Mlle. Emma PC 17477 9 NaN NaN \n", "3 Panula, Master. Urho Abraham 3101295 NaN NaN NaN \n", "4 Maioni, Miss. Roberta 110152 8 NaN NaN \n", "\n", " sex__target_1 embarked__target_1 cabin__target_1 pclass__target_1 \\\n", "0 0.184564 0.341945 0.195652 0.612766 \n", "1 0.184564 0.341945 0.597354 0.612766 \n", "2 0.746398 0.576720 0.695652 0.612766 \n", "3 0.184564 0.341945 0.314483 0.264822 \n", "4 0.746398 0.341945 0.391304 0.612766 \n", "\n", " age__target_1 sibsp__target_1 parch__target_1 fare__target_1 \n", "0 0.547457 0.353941 0.328632 0.185878 \n", "1 0.453744 0.524017 0.276773 0.597354 \n", "2 0.481185 0.353941 0.328632 0.695652 \n", "3 0.463933 0.267929 0.653542 0.166522 \n", "4 0.427970 0.353941 0.328632 0.710857 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aikit.transformers import TargetEncoderClassifier\n", "\n", "# Dedicated names: `encoder` and `Xtrain_cat` keep referring to the NumericalEncoder\n", "# example above, so its cells show the same result when they are re-run.\n", "target_columns = columns_block[\"CAT\"] + columns_block[\"NUM\"]\n", "target_encoder = TargetEncoderClassifier(columns_to_use=target_columns)\n", "Xtrain_target = target_encoder.fit_transform(Xtrain, y_train)\n", "\n", "Xtrain_target.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CountVectorizer" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<1048x62 sparse matrix of type ''\n", "\twith 21936 stored elements in COOrdinate format>" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aikit.transformers import CountVectorizerWrapper\n", "\n", "# Character-level vectorizer applied to the text columns.\n", "text_encoder = CountVectorizerWrapper(analyzer=\"char\", columns_to_use=columns_block[\"TEXT\"])\n", "\n", "# Xtrain_enc is reused as the input of the Truncated SVD section.\n", "Xtrain_enc = text_encoder.fit_transform(Xtrain)\n", "Xtrain_enc" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['name__BAG__ ',\n", " 
\"name__BAG__'\",\n", " 'name__BAG__(',\n", " 'name__BAG__)',\n", " 'name__BAG__,',\n", " 'name__BAG__-',\n", " 'name__BAG__.',\n", " 'name__BAG__/',\n", " 'name__BAG__a',\n", " 'name__BAG__b',\n", " 'name__BAG__c',\n", " 'name__BAG__d',\n", " 'name__BAG__e',\n", " 'name__BAG__f',\n", " 'name__BAG__g',\n", " 'name__BAG__h',\n", " 'name__BAG__i',\n", " 'name__BAG__j',\n", " 'name__BAG__k',\n", " 'name__BAG__l',\n", " 'name__BAG__m',\n", " 'name__BAG__n',\n", " 'name__BAG__o',\n", " 'name__BAG__p',\n", " 'name__BAG__q',\n", " 'name__BAG__r',\n", " 'name__BAG__s',\n", " 'name__BAG__t',\n", " 'name__BAG__u',\n", " 'name__BAG__v',\n", " 'name__BAG__w',\n", " 'name__BAG__x',\n", " 'name__BAG__y',\n", " 'name__BAG__z',\n", " 'ticket__BAG__ ',\n", " 'ticket__BAG__.',\n", " 'ticket__BAG__/',\n", " 'ticket__BAG__0',\n", " 'ticket__BAG__1',\n", " 'ticket__BAG__2',\n", " 'ticket__BAG__3',\n", " 'ticket__BAG__4',\n", " 'ticket__BAG__5',\n", " 'ticket__BAG__6',\n", " 'ticket__BAG__7',\n", " 'ticket__BAG__8',\n", " 'ticket__BAG__9',\n", " 'ticket__BAG__a',\n", " 'ticket__BAG__c',\n", " 'ticket__BAG__e',\n", " 'ticket__BAG__f',\n", " 'ticket__BAG__h',\n", " 'ticket__BAG__i',\n", " 'ticket__BAG__l',\n", " 'ticket__BAG__n',\n", " 'ticket__BAG__o',\n", " 'ticket__BAG__p',\n", " 'ticket__BAG__q',\n", " 'ticket__BAG__r',\n", " 'ticket__BAG__s',\n", " 'ticket__BAG__t',\n", " 'ticket__BAG__w']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One name per column of the count matrix built in the previous cell.\n", "text_encoder.get_feature_names()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Truncated SVD" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SVD__0SVD__1SVD__2SVD__3SVD__4SVD__5
05.087193-0.825855-1.7778271.0796311.131004-1.943098
14.329638-1.584227-1.5921650.3564130.598414-0.024379
25.899164-0.1975732.482450-0.058293-1.766613-0.709567
37.4192240.221492-2.5344012.470868-1.7149510.337166
45.7712801.272598-0.8360110.1265491.036792-0.689974
.....................
10437.751914-0.1996220.934052-0.541200-1.632366-0.380562
10447.063821-1.730135-1.130117-2.0860490.066611-0.866250
10455.5955931.0298251.5347401.1496021.4874061.612092
10464.625180-1.261425-0.873728-0.5666041.0320360.202452
10475.0202001.633941-2.131337-0.5677310.226184-1.410670
\n", "

1048 rows × 6 columns

\n", "
" ], "text/plain": [ " SVD__0 SVD__1 SVD__2 SVD__3 SVD__4 SVD__5\n", "0 5.087193 -0.825855 -1.777827 1.079631 1.131004 -1.943098\n", "1 4.329638 -1.584227 -1.592165 0.356413 0.598414 -0.024379\n", "2 5.899164 -0.197573 2.482450 -0.058293 -1.766613 -0.709567\n", "3 7.419224 0.221492 -2.534401 2.470868 -1.714951 0.337166\n", "4 5.771280 1.272598 -0.836011 0.126549 1.036792 -0.689974\n", "... ... ... ... ... ... ...\n", "1043 7.751914 -0.199622 0.934052 -0.541200 -1.632366 -0.380562\n", "1044 7.063821 -1.730135 -1.130117 -2.086049 0.066611 -0.866250\n", "1045 5.595593 1.029825 1.534740 1.149602 1.487406 1.612092\n", "1046 4.625180 -1.261425 -0.873728 -0.566604 1.032036 0.202452\n", "1047 5.020200 1.633941 -2.131337 -0.567731 0.226184 -1.410670\n", "\n", "[1048 rows x 6 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aikit.transformers import TruncatedSVDWrapper\n", "\n", "# With n_components=0.1 the 62 count columns are reduced to the 6 components shown below.\n", "# random_state is fixed so that re-running the notebook gives the same projection.\n", "# NOTE(review): assumes TruncatedSVDWrapper accepts random_state - confirm against the installed aikit.\n", "svd = TruncatedSVDWrapper(n_components=0.1, random_state=123)\n", "\n", "Xtrain_svd = svd.fit_transform(Xtrain_enc)\n", "Xtrain_svd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Column selector" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameticket
0McCarthy, Mr. Timothy J17463
1Fortune, Mr. Mark19950
2Sagesser, Mlle. EmmaPC 17477
3Panula, Master. Urho Abraham3101295
4Maioni, Miss. Roberta110152
5Waelens, Mr. Achille345767
6Reed, Mr. James George362316
7Swift, Mrs. Frederick Joel (Margaret Welles Ba...17466
8Smith, Mrs. Lucien Philip (Mary Eloise Hughes)13695
9Rowe, Mr. Alfred G113790
\n", "
" ], "text/plain": [ " name ticket\n", "0 McCarthy, Mr. Timothy J 17463\n", "1 Fortune, Mr. Mark 19950\n", "2 Sagesser, Mlle. Emma PC 17477\n", "3 Panula, Master. Urho Abraham 3101295\n", "4 Maioni, Miss. Roberta 110152\n", "5 Waelens, Mr. Achille 345767\n", "6 Reed, Mr. James George 362316\n", "7 Swift, Mrs. Frederick Joel (Margaret Welles Ba... 17466\n", "8 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) 13695\n", "9 Rowe, Mr. Alfred G 113790" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aikit.transformers import ColumnsSelector\n", "\n", "# Keep only the text columns of the original frame.\n", "selector = ColumnsSelector(columns_to_use=columns_block[\"TEXT\"])\n", "Xtrain_subset = selector.fit_transform(Xtrain)\n", "Xtrain_subset.head(10)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }