{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Choice of columns" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\HOMEWARE\\Anaconda3-Windows-x86_64\\lib\\site-packages\\gensim\\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n", " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n" ] } ], "source": [ "from aikit.datasets.datasets import load_dataset, DatasetEnum\n", "Xtrain, y_train, _ ,_ , _ = load_dataset(DatasetEnum.titanic)\n", "\n", "from aikit.transformers import NumericalEncoder\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclassnamesexagesibspparchticketfarecabinembarkedboatbodyhome_dest
01McCarthy, Mr. Timothy Jmale54.0001746351.8625E46SNaN175.0Dorchester, MA
11Fortune, Mr. Markmale64.01419950263.0000C23 C25 C27SNaNNaNWinnipeg, MB
21Sagesser, Mlle. Emmafemale24.000PC 1747769.3000B35C9NaNNaN
33Panula, Master. Urho Abrahammale2.041310129539.6875NaNSNaNNaNNaN
41Maioni, Miss. Robertafemale16.00011015286.5000B79S8NaNNaN
..........................................
10432Sobey, Mr. Samuel James Haydenmale25.000C.A. 2917813.0000NaNSNaNNaNCornwall / Houghton, MI
10441Ryerson, Master. John Boriemale13.022PC 17608262.3750B57 B59 B63 B66C4NaNHaverford, PA / Cooperstown, NY
10452Lahtinen, Rev. Williammale30.01125065126.0000NaNSNaNNaNMinneapolis, MN
10463Drazenoic, Mr. Jozefmale33.0003492417.8958NaNCNaN51.0Austria Niagara Falls, NY
10472Hosono, Mr. Masabumimale42.00023779813.0000NaNS10NaNTokyo, Japan
\n", "

1048 rows × 13 columns

\n", "
" ], "text/plain": [ " pclass name sex age sibsp parch \\\n", "0 1 McCarthy, Mr. Timothy J male 54.0 0 0 \n", "1 1 Fortune, Mr. Mark male 64.0 1 4 \n", "2 1 Sagesser, Mlle. Emma female 24.0 0 0 \n", "3 3 Panula, Master. Urho Abraham male 2.0 4 1 \n", "4 1 Maioni, Miss. Roberta female 16.0 0 0 \n", "... ... ... ... ... ... ... \n", "1043 2 Sobey, Mr. Samuel James Hayden male 25.0 0 0 \n", "1044 1 Ryerson, Master. John Borie male 13.0 2 2 \n", "1045 2 Lahtinen, Rev. William male 30.0 1 1 \n", "1046 3 Drazenoic, Mr. Jozef male 33.0 0 0 \n", "1047 2 Hosono, Mr. Masabumi male 42.0 0 0 \n", "\n", " ticket fare cabin embarked boat body \\\n", "0 17463 51.8625 E46 S NaN 175.0 \n", "1 19950 263.0000 C23 C25 C27 S NaN NaN \n", "2 PC 17477 69.3000 B35 C 9 NaN \n", "3 3101295 39.6875 NaN S NaN NaN \n", "4 110152 86.5000 B79 S 8 NaN \n", "... ... ... ... ... ... ... \n", "1043 C.A. 29178 13.0000 NaN S NaN NaN \n", "1044 PC 17608 262.3750 B57 B59 B63 B66 C 4 NaN \n", "1045 250651 26.0000 NaN S NaN NaN \n", "1046 349241 7.8958 NaN C NaN 51.0 \n", "1047 237798 13.0000 NaN S 10 NaN \n", "\n", " home_dest \n", "0 Dorchester, MA \n", "1 Winnipeg, MB \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... \n", "1043 Cornwall / Houghton, MI \n", "1044 Haverford, PA / Cooperstown, NY \n", "1045 Minneapolis, MN \n", "1046 Austria Niagara Falls, NY \n", "1047 Tokyo, Japan \n", "\n", "[1048 rows x 13 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Xtrain" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclassnameagesibspparchticketfarecabinembarkedboatbodysex__malesex__femalehome_dest____null__home_dest__New York, NYhome_dest__Londonhome_dest____default__
01McCarthy, Mr. Timothy J54.0001746351.8625E46SNaN175.0100001
11Fortune, Mr. Mark64.01419950263.0000C23 C25 C27SNaNNaN100001
21Sagesser, Mlle. Emma24.000PC 1747769.3000B35C9NaN011000
33Panula, Master. Urho Abraham2.041310129539.6875NaNSNaNNaN101000
41Maioni, Miss. Roberta16.00011015286.5000B79S8NaN011000
\n", "
" ], "text/plain": [ " pclass name age sibsp parch ticket \\\n", "0 1 McCarthy, Mr. Timothy J 54.0 0 0 17463 \n", "1 1 Fortune, Mr. Mark 64.0 1 4 19950 \n", "2 1 Sagesser, Mlle. Emma 24.0 0 0 PC 17477 \n", "3 3 Panula, Master. Urho Abraham 2.0 4 1 3101295 \n", "4 1 Maioni, Miss. Roberta 16.0 0 0 110152 \n", "\n", " fare cabin embarked boat body sex__male sex__female \\\n", "0 51.8625 E46 S NaN 175.0 1 0 \n", "1 263.0000 C23 C25 C27 S NaN NaN 1 0 \n", "2 69.3000 B35 C 9 NaN 0 1 \n", "3 39.6875 NaN S NaN NaN 1 0 \n", "4 86.5000 B79 S 8 NaN 0 1 \n", "\n", " home_dest____null__ home_dest__New York, NY home_dest__London \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 1 0 0 \n", "3 1 0 0 \n", "4 1 0 0 \n", "\n", " home_dest____default__ \n", "0 1 \n", "1 1 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"])\n", "Xencoded = encoder.fit_transform(Xtrain)\n", "Xencoded.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Called like this the transformer encodes \"sex\" and \"home_dest\" and keeps the other columns untouched" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1048, 6)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sex__malesex__femalehome_dest____null__home_dest__New York, NYhome_dest__Londonhome_dest____default__
0100001
1100001
2011000
3101000
4011000
\n", "
" ], "text/plain": [ " sex__male sex__female home_dest____null__ home_dest__New York, NY \\\n", "0 1 0 0 0 \n", "1 1 0 0 0 \n", "2 0 1 1 0 \n", "3 1 0 1 0 \n", "4 0 1 1 0 \n", "\n", " home_dest__London home_dest____default__ \n", "0 0 1 \n", "1 0 1 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"], drop_unused_columns=True)\n", "Xencoded = encoder.fit_transform(Xtrain)\n", "print(Xencoded.shape)\n", "Xencoded.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Called like this the transformer encodes \"sex\" and \"home_dest\" and drops the other columns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "New York, NY 47\n", "London 11\n", "Cornwall / Akron, OH 9\n", "Winnipeg, MB 7\n", "Montreal, PQ 7\n", " ..\n", "London / Birmingham 1\n", "Folkstone, Kent / New York, NY 1\n", "Treherbert, Cardiff, Wales 1\n", "Devonport, England 1\n", "Buenos Aires, Argentina / New Jersey, NJ 1\n", "Name: home_dest, Length: 333, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Xtrain[\"home_dest\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Only the most frequent modalities are kept (this can be changed)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1048, 336)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sex__malesex__femalehome_dest____null__home_dest__New York, NYhome_dest__Londonhome_dest__Cornwall / Akron, OHhome_dest__Winnipeg, MBhome_dest__Montreal, PQhome_dest__Philadelphia, PAhome_dest__Paris, France...home_dest__Deer Lodge, MThome_dest__Bristol, England / New Britain, CThome_dest__Holley, NYhome_dest__Bryn Mawr, PA, USAhome_dest__Tokyo, Japanhome_dest__Oslo, Norway Cameron, WIhome_dest__Cambridge, MAhome_dest__Ireland Brooklyn, NYhome_dest__Englandhome_dest__Aughnacliff, Co Longford, Ireland New York, NY
01000000000...0000000000
11000001000...0000000000
20110000000...0000000000
31010000000...0000000000
40110000000...0000000000
\n", "

5 rows × 336 columns

\n", "
" ], "text/plain": [ " sex__male sex__female home_dest____null__ home_dest__New York, NY \\\n", "0 1 0 0 0 \n", "1 1 0 0 0 \n", "2 0 1 1 0 \n", "3 1 0 1 0 \n", "4 0 1 1 0 \n", "\n", " home_dest__London home_dest__Cornwall / Akron, OH \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " home_dest__Winnipeg, MB home_dest__Montreal, PQ \\\n", "0 0 0 \n", "1 1 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " home_dest__Philadelphia, PA home_dest__Paris, France ... \\\n", "0 0 0 ... \n", "1 0 0 ... \n", "2 0 0 ... \n", "3 0 0 ... \n", "4 0 0 ... \n", "\n", " home_dest__Deer Lodge, MT home_dest__Bristol, England / New Britain, CT \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " home_dest__Holley, NY home_dest__Bryn Mawr, PA, USA \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " home_dest__Tokyo, Japan home_dest__Oslo, Norway Cameron, WI \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " home_dest__Cambridge, MA home_dest__Ireland Brooklyn, NY \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " home_dest__England \\\n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", " home_dest__Aughnacliff, Co Longford, Ireland New York, NY \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", "[5 rows x 336 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"],\n", " drop_unused_columns=True,\n", " min_modalities_number=400)\n", "Xencoded = encoder.fit_transform(Xtrain)\n", "print(Xencoded.shape)\n", "Xencoded.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If I specify 'min_modalities_number': 400, all the modalities are kept.\n", "(I'll start filtering the modalities only if I have more than 400.)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ "(1048, 19)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclassnamesexagesibspparchticketfarecabinembarkedboatbodyhome_destsex__malesex__femalehome_dest____null__home_dest__New York, NYhome_dest__Londonhome_dest____default__
01McCarthy, Mr. Timothy Jmale54.0001746351.8625E46SNaN175.0Dorchester, MA100001
11Fortune, Mr. Markmale64.01419950263.0000C23 C25 C27SNaNNaNWinnipeg, MB100001
21Sagesser, Mlle. Emmafemale24.000PC 1747769.3000B35C9NaNNaN011000
33Panula, Master. Urho Abrahammale2.041310129539.6875NaNSNaNNaNNaN101000
41Maioni, Miss. Robertafemale16.00011015286.5000B79S8NaNNaN011000
\n", "
" ], "text/plain": [ " pclass name sex age sibsp parch ticket \\\n", "0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 \n", "1 1 Fortune, Mr. Mark male 64.0 1 4 19950 \n", "2 1 Sagesser, Mlle. Emma female 24.0 0 0 PC 17477 \n", "3 3 Panula, Master. Urho Abraham male 2.0 4 1 3101295 \n", "4 1 Maioni, Miss. Roberta female 16.0 0 0 110152 \n", "\n", " fare cabin embarked boat body home_dest sex__male \\\n", "0 51.8625 E46 S NaN 175.0 Dorchester, MA 1 \n", "1 263.0000 C23 C25 C27 S NaN NaN Winnipeg, MB 1 \n", "2 69.3000 B35 C 9 NaN NaN 0 \n", "3 39.6875 NaN S NaN NaN NaN 1 \n", "4 86.5000 B79 S 8 NaN NaN 0 \n", "\n", " sex__female home_dest____null__ home_dest__New York, NY \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 1 1 0 \n", "3 0 1 0 \n", "4 1 1 0 \n", "\n", " home_dest__London home_dest____default__ \n", "0 0 1 \n", "1 0 1 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"], drop_used_columns=False)\n", "Xencoded = encoder.fit_transform(Xtrain)\n", "print(Xencoded.shape)\n", "Xencoded.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Called like this the transformer encodes \"sex\" and \"home_dest\" but also keeps them in the final result" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SVD__0SVD__1
02.075079-0.858158
10.332307-0.970121
22.2794171.340435
3-0.5634420.551599
4-1.640313-1.569441
\n", "
" ], "text/plain": [ " SVD__0 SVD__1\n", "0 2.075079 -0.858158\n", "1 0.332307 -0.970121\n", "2 2.279417 1.340435\n", "3 -0.563442 0.551599\n", "4 -1.640313 -1.569441" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# numpy and pandas are required to build the random demo frame below\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from aikit.transformers import TruncatedSVDWrapper\n", "\n", "X = pd.DataFrame(np.random.randn(100,20), columns=[f\"COL_{j}\" for j in range(20)])\n", "\n", "svd = TruncatedSVDWrapper(n_components=2, drop_used_columns=True)\n", "Xencoded = svd.fit_transform(X)\n", "Xencoded.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COL_0COL_1COL_2COL_3COL_4COL_5COL_6COL_7COL_8COL_9...COL_12COL_13COL_14COL_15COL_16COL_17COL_18COL_19SVD__0SVD__1
00.858982-0.655989-0.028417-0.3573980.5695310.1458160.5523681.9834381.092890-0.453562...0.285189-0.604234-1.053623-0.291745-1.646335-0.2155310.0085001.1002972.076530-0.854836
1-0.1879360.0416840.9419441.8989250.1791250.6364182.0501730.229349-1.9103680.702720...-0.533445-0.371779-0.4012050.231492-1.0431761.8423880.3292710.8820170.346758-0.951460
21.097298-0.136058-0.323606-1.096158-0.009371-0.9452671.455854-0.1081601.141867-1.407562...2.3101532.414735-0.184708-1.486121-0.676003-0.686621-0.8368300.9729782.3303891.407472
30.9289340.269935-1.274605-0.2870770.279328-0.3208710.802277-0.713909-1.0392501.227245...0.0202980.259960-0.8853200.0148200.268819-0.4324351.2541640.031453-0.5720560.539412
4-0.7144671.637883-0.4513130.4099560.5659260.448906-0.128214-0.8453200.433473-0.416148...0.758863-1.702709-0.000005-0.293631-0.859405-0.1670670.400996-1.095900-1.603850-1.532443
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " COL_0 COL_1 COL_2 COL_3 COL_4 COL_5 COL_6 \\\n", "0 0.858982 -0.655989 -0.028417 -0.357398 0.569531 0.145816 0.552368 \n", "1 -0.187936 0.041684 0.941944 1.898925 0.179125 0.636418 2.050173 \n", "2 1.097298 -0.136058 -0.323606 -1.096158 -0.009371 -0.945267 1.455854 \n", "3 0.928934 0.269935 -1.274605 -0.287077 0.279328 -0.320871 0.802277 \n", "4 -0.714467 1.637883 -0.451313 0.409956 0.565926 0.448906 -0.128214 \n", "\n", " COL_7 COL_8 COL_9 ... COL_12 COL_13 COL_14 COL_15 \\\n", "0 1.983438 1.092890 -0.453562 ... 0.285189 -0.604234 -1.053623 -0.291745 \n", "1 0.229349 -1.910368 0.702720 ... -0.533445 -0.371779 -0.401205 0.231492 \n", "2 -0.108160 1.141867 -1.407562 ... 2.310153 2.414735 -0.184708 -1.486121 \n", "3 -0.713909 -1.039250 1.227245 ... 0.020298 0.259960 -0.885320 0.014820 \n", "4 -0.845320 0.433473 -0.416148 ... 0.758863 -1.702709 -0.000005 -0.293631 \n", "\n", " COL_16 COL_17 COL_18 COL_19 SVD__0 SVD__1 \n", "0 -1.646335 -0.215531 0.008500 1.100297 2.076530 -0.854836 \n", "1 -1.043176 1.842388 0.329271 0.882017 0.346758 -0.951460 \n", "2 -0.676003 -0.686621 -0.836830 0.972978 2.330389 1.407472 \n", "3 0.268819 -0.432435 1.254164 0.031453 -0.572056 0.539412 \n", "4 -0.859405 -0.167067 0.400996 -1.095900 -1.603850 -1.532443 \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "svd = TruncatedSVDWrapper(n_components=2, drop_used_columns=False)\n", "Xencoded = svd.fit_transform(X)\n", "Xencoded.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another example of the usage of 'drop_used_columns' and 'drop_unused_columns':\n", " * in the first case (drop_used_columns = True) : only the SVD columns are retrieved\n", " * in the second case (drop_used_columns = False) : I retrieve the original columns AND the SVD columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": 
{ "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }