{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Choice of columns"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\HOMEWARE\\Anaconda3-Windows-x86_64\\lib\\site-packages\\gensim\\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
" warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
]
}
],
"source": [
"from aikit.datasets.datasets import DatasetEnum, load_dataset\n",
"from aikit.transformers import NumericalEncoder\n",
"\n",
"# Only the first two values returned by load_dataset are used in this notebook.\n",
"Xtrain, y_train, _, _, _ = load_dataset(DatasetEnum.titanic)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home_dest | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" McCarthy, Mr. Timothy J | \n",
" male | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" 17463 | \n",
" 51.8625 | \n",
" E46 | \n",
" S | \n",
" NaN | \n",
" 175.0 | \n",
" Dorchester, MA | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" male | \n",
" 64.0 | \n",
" 1 | \n",
" 4 | \n",
" 19950 | \n",
" 263.0000 | \n",
" C23 C25 C27 | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Winnipeg, MB | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" Sagesser, Mlle. Emma | \n",
" female | \n",
" 24.0 | \n",
" 0 | \n",
" 0 | \n",
" PC 17477 | \n",
" 69.3000 | \n",
" B35 | \n",
" C | \n",
" 9 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" male | \n",
" 2.0 | \n",
" 4 | \n",
" 1 | \n",
" 3101295 | \n",
" 39.6875 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" Maioni, Miss. Roberta | \n",
" female | \n",
" 16.0 | \n",
" 0 | \n",
" 0 | \n",
" 110152 | \n",
" 86.5000 | \n",
" B79 | \n",
" S | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1043 | \n",
" 2 | \n",
" Sobey, Mr. Samuel James Hayden | \n",
" male | \n",
" 25.0 | \n",
" 0 | \n",
" 0 | \n",
" C.A. 29178 | \n",
" 13.0000 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Cornwall / Houghton, MI | \n",
"
\n",
" \n",
" 1044 | \n",
" 1 | \n",
" Ryerson, Master. John Borie | \n",
" male | \n",
" 13.0 | \n",
" 2 | \n",
" 2 | \n",
" PC 17608 | \n",
" 262.3750 | \n",
" B57 B59 B63 B66 | \n",
" C | \n",
" 4 | \n",
" NaN | \n",
" Haverford, PA / Cooperstown, NY | \n",
"
\n",
" \n",
" 1045 | \n",
" 2 | \n",
" Lahtinen, Rev. William | \n",
" male | \n",
" 30.0 | \n",
" 1 | \n",
" 1 | \n",
" 250651 | \n",
" 26.0000 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Minneapolis, MN | \n",
"
\n",
" \n",
" 1046 | \n",
" 3 | \n",
" Drazenoic, Mr. Jozef | \n",
" male | \n",
" 33.0 | \n",
" 0 | \n",
" 0 | \n",
" 349241 | \n",
" 7.8958 | \n",
" NaN | \n",
" C | \n",
" NaN | \n",
" 51.0 | \n",
" Austria Niagara Falls, NY | \n",
"
\n",
" \n",
" 1047 | \n",
" 2 | \n",
" Hosono, Mr. Masabumi | \n",
" male | \n",
" 42.0 | \n",
" 0 | \n",
" 0 | \n",
" 237798 | \n",
" 13.0000 | \n",
" NaN | \n",
" S | \n",
" 10 | \n",
" NaN | \n",
" Tokyo, Japan | \n",
"
\n",
" \n",
"
\n",
"
1048 rows × 13 columns
\n",
"
"
],
"text/plain": [
" pclass name sex age sibsp parch \\\n",
"0 1 McCarthy, Mr. Timothy J male 54.0 0 0 \n",
"1 1 Fortune, Mr. Mark male 64.0 1 4 \n",
"2 1 Sagesser, Mlle. Emma female 24.0 0 0 \n",
"3 3 Panula, Master. Urho Abraham male 2.0 4 1 \n",
"4 1 Maioni, Miss. Roberta female 16.0 0 0 \n",
"... ... ... ... ... ... ... \n",
"1043 2 Sobey, Mr. Samuel James Hayden male 25.0 0 0 \n",
"1044 1 Ryerson, Master. John Borie male 13.0 2 2 \n",
"1045 2 Lahtinen, Rev. William male 30.0 1 1 \n",
"1046 3 Drazenoic, Mr. Jozef male 33.0 0 0 \n",
"1047 2 Hosono, Mr. Masabumi male 42.0 0 0 \n",
"\n",
" ticket fare cabin embarked boat body \\\n",
"0 17463 51.8625 E46 S NaN 175.0 \n",
"1 19950 263.0000 C23 C25 C27 S NaN NaN \n",
"2 PC 17477 69.3000 B35 C 9 NaN \n",
"3 3101295 39.6875 NaN S NaN NaN \n",
"4 110152 86.5000 B79 S 8 NaN \n",
"... ... ... ... ... ... ... \n",
"1043 C.A. 29178 13.0000 NaN S NaN NaN \n",
"1044 PC 17608 262.3750 B57 B59 B63 B66 C 4 NaN \n",
"1045 250651 26.0000 NaN S NaN NaN \n",
"1046 349241 7.8958 NaN C NaN 51.0 \n",
"1047 237798 13.0000 NaN S 10 NaN \n",
"\n",
" home_dest \n",
"0 Dorchester, MA \n",
"1 Winnipeg, MB \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"1043 Cornwall / Houghton, MI \n",
"1044 Haverford, PA / Cooperstown, NY \n",
"1045 Minneapolis, MN \n",
"1046 Austria Niagara Falls, NY \n",
"1047 Tokyo, Japan \n",
"\n",
"[1048 rows x 13 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Xtrain"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" name | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" sex__male | \n",
" sex__female | \n",
" home_dest____null__ | \n",
" home_dest__New York, NY | \n",
" home_dest__London | \n",
" home_dest____default__ | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" McCarthy, Mr. Timothy J | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" 17463 | \n",
" 51.8625 | \n",
" E46 | \n",
" S | \n",
" NaN | \n",
" 175.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" 64.0 | \n",
" 1 | \n",
" 4 | \n",
" 19950 | \n",
" 263.0000 | \n",
" C23 C25 C27 | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" Sagesser, Mlle. Emma | \n",
" 24.0 | \n",
" 0 | \n",
" 0 | \n",
" PC 17477 | \n",
" 69.3000 | \n",
" B35 | \n",
" C | \n",
" 9 | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" 2.0 | \n",
" 4 | \n",
" 1 | \n",
" 3101295 | \n",
" 39.6875 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" Maioni, Miss. Roberta | \n",
" 16.0 | \n",
" 0 | \n",
" 0 | \n",
" 110152 | \n",
" 86.5000 | \n",
" B79 | \n",
" S | \n",
" 8 | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass name age sibsp parch ticket \\\n",
"0 1 McCarthy, Mr. Timothy J 54.0 0 0 17463 \n",
"1 1 Fortune, Mr. Mark 64.0 1 4 19950 \n",
"2 1 Sagesser, Mlle. Emma 24.0 0 0 PC 17477 \n",
"3 3 Panula, Master. Urho Abraham 2.0 4 1 3101295 \n",
"4 1 Maioni, Miss. Roberta 16.0 0 0 110152 \n",
"\n",
" fare cabin embarked boat body sex__male sex__female \\\n",
"0 51.8625 E46 S NaN 175.0 1 0 \n",
"1 263.0000 C23 C25 C27 S NaN NaN 1 0 \n",
"2 69.3000 B35 C 9 NaN 0 1 \n",
"3 39.6875 NaN S NaN NaN 1 0 \n",
"4 86.5000 B79 S 8 NaN 0 1 \n",
"\n",
" home_dest____null__ home_dest__New York, NY home_dest__London \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 1 0 0 \n",
"3 1 0 0 \n",
"4 1 0 0 \n",
"\n",
" home_dest____default__ \n",
"0 1 \n",
"1 1 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"])\n",
"Xencoded = encoder.fit_transform(Xtrain)\n",
"Xencoded.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Called like this, the transformer encodes \"sex\" and \"home_dest\" and keeps the other columns untouched"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1048, 6)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex__male | \n",
" sex__female | \n",
" home_dest____null__ | \n",
" home_dest__New York, NY | \n",
" home_dest__London | \n",
" home_dest____default__ | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex__male sex__female home_dest____null__ home_dest__New York, NY \\\n",
"0 1 0 0 0 \n",
"1 1 0 0 0 \n",
"2 0 1 1 0 \n",
"3 1 0 1 0 \n",
"4 0 1 1 0 \n",
"\n",
" home_dest__London home_dest____default__ \n",
"0 0 1 \n",
"1 0 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"], drop_unused_columns=True)\n",
"Xencoded = encoder.fit_transform(Xtrain)\n",
"print(Xencoded.shape)\n",
"Xencoded.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Called like this, the transformer encodes \"sex\" and \"home_dest\" and drops the other columns"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"New York, NY 47\n",
"London 11\n",
"Cornwall / Akron, OH 9\n",
"Winnipeg, MB 7\n",
"Montreal, PQ 7\n",
" ..\n",
"London / Birmingham 1\n",
"Folkstone, Kent / New York, NY 1\n",
"Treherbert, Cardiff, Wales 1\n",
"Devonport, England 1\n",
"Buenos Aires, Argentina / New Jersey, NJ 1\n",
"Name: home_dest, Length: 333, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Xtrain[\"home_dest\"].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Only the most frequent modalities are kept (this can be changed)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1048, 336)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex__male | \n",
" sex__female | \n",
" home_dest____null__ | \n",
" home_dest__New York, NY | \n",
" home_dest__London | \n",
" home_dest__Cornwall / Akron, OH | \n",
" home_dest__Winnipeg, MB | \n",
" home_dest__Montreal, PQ | \n",
" home_dest__Philadelphia, PA | \n",
" home_dest__Paris, France | \n",
" ... | \n",
" home_dest__Deer Lodge, MT | \n",
" home_dest__Bristol, England / New Britain, CT | \n",
" home_dest__Holley, NY | \n",
" home_dest__Bryn Mawr, PA, USA | \n",
" home_dest__Tokyo, Japan | \n",
" home_dest__Oslo, Norway Cameron, WI | \n",
" home_dest__Cambridge, MA | \n",
" home_dest__Ireland Brooklyn, NY | \n",
" home_dest__England | \n",
" home_dest__Aughnacliff, Co Longford, Ireland New York, NY | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 336 columns
\n",
"
"
],
"text/plain": [
" sex__male sex__female home_dest____null__ home_dest__New York, NY \\\n",
"0 1 0 0 0 \n",
"1 1 0 0 0 \n",
"2 0 1 1 0 \n",
"3 1 0 1 0 \n",
"4 0 1 1 0 \n",
"\n",
" home_dest__London home_dest__Cornwall / Akron, OH \\\n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" home_dest__Winnipeg, MB home_dest__Montreal, PQ \\\n",
"0 0 0 \n",
"1 1 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" home_dest__Philadelphia, PA home_dest__Paris, France ... \\\n",
"0 0 0 ... \n",
"1 0 0 ... \n",
"2 0 0 ... \n",
"3 0 0 ... \n",
"4 0 0 ... \n",
"\n",
" home_dest__Deer Lodge, MT home_dest__Bristol, England / New Britain, CT \\\n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" home_dest__Holley, NY home_dest__Bryn Mawr, PA, USA \\\n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" home_dest__Tokyo, Japan home_dest__Oslo, Norway Cameron, WI \\\n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" home_dest__Cambridge, MA home_dest__Ireland Brooklyn, NY \\\n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" home_dest__England \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" home_dest__Aughnacliff, Co Longford, Ireland New York, NY \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 336 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"],\n",
" drop_unused_columns=True,\n",
" min_modalities_number=400)\n",
"Xencoded = encoder.fit_transform(Xtrain)\n",
"print(Xencoded.shape)\n",
"Xencoded.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### If I specify 'min_modalities_number': 400, all the modalities are kept.\n",
"(I'll start filtering the modalities only if I have more than 400.)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1048, 19)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home_dest | \n",
" sex__male | \n",
" sex__female | \n",
" home_dest____null__ | \n",
" home_dest__New York, NY | \n",
" home_dest__London | \n",
" home_dest____default__ | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" McCarthy, Mr. Timothy J | \n",
" male | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" 17463 | \n",
" 51.8625 | \n",
" E46 | \n",
" S | \n",
" NaN | \n",
" 175.0 | \n",
" Dorchester, MA | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Fortune, Mr. Mark | \n",
" male | \n",
" 64.0 | \n",
" 1 | \n",
" 4 | \n",
" 19950 | \n",
" 263.0000 | \n",
" C23 C25 C27 | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" Winnipeg, MB | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" Sagesser, Mlle. Emma | \n",
" female | \n",
" 24.0 | \n",
" 0 | \n",
" 0 | \n",
" PC 17477 | \n",
" 69.3000 | \n",
" B35 | \n",
" C | \n",
" 9 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Panula, Master. Urho Abraham | \n",
" male | \n",
" 2.0 | \n",
" 4 | \n",
" 1 | \n",
" 3101295 | \n",
" 39.6875 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" Maioni, Miss. Roberta | \n",
" female | \n",
" 16.0 | \n",
" 0 | \n",
" 0 | \n",
" 110152 | \n",
" 86.5000 | \n",
" B79 | \n",
" S | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass name sex age sibsp parch ticket \\\n",
"0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 \n",
"1 1 Fortune, Mr. Mark male 64.0 1 4 19950 \n",
"2 1 Sagesser, Mlle. Emma female 24.0 0 0 PC 17477 \n",
"3 3 Panula, Master. Urho Abraham male 2.0 4 1 3101295 \n",
"4 1 Maioni, Miss. Roberta female 16.0 0 0 110152 \n",
"\n",
" fare cabin embarked boat body home_dest sex__male \\\n",
"0 51.8625 E46 S NaN 175.0 Dorchester, MA 1 \n",
"1 263.0000 C23 C25 C27 S NaN NaN Winnipeg, MB 1 \n",
"2 69.3000 B35 C 9 NaN NaN 0 \n",
"3 39.6875 NaN S NaN NaN NaN 1 \n",
"4 86.5000 B79 S 8 NaN NaN 0 \n",
"\n",
" sex__female home_dest____null__ home_dest__New York, NY \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 1 1 0 \n",
"3 0 1 0 \n",
"4 1 1 0 \n",
"\n",
" home_dest__London home_dest____default__ \n",
"0 0 1 \n",
"1 0 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder = NumericalEncoder(columns_to_use=[\"sex\",\"home_dest\"], drop_used_columns=False)\n",
"Xencoded = encoder.fit_transform(Xtrain)\n",
"print(Xencoded.shape)\n",
"Xencoded.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Called like this, the transformer encodes \"sex\" and \"home_dest\" but also keeps them in the final result"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SVD__0 | \n",
" SVD__1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2.075079 | \n",
" -0.858158 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.332307 | \n",
" -0.970121 | \n",
"
\n",
" \n",
" 2 | \n",
" 2.279417 | \n",
" 1.340435 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.563442 | \n",
" 0.551599 | \n",
"
\n",
" \n",
" 4 | \n",
" -1.640313 | \n",
" -1.569441 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SVD__0 SVD__1\n",
"0 2.075079 -0.858158\n",
"1 0.332307 -0.970121\n",
"2 2.279417 1.340435\n",
"3 -0.563442 0.551599\n",
"4 -1.640313 -1.569441"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from aikit.transformers import TruncatedSVDWrapper\n",
"\n",
"# numpy and pandas are imported in this cell because no earlier cell brings `np` / `pd`\n",
"# into scope; without them the cell raises NameError on a fresh kernel.\n",
"\n",
"# Random 100 x 20 numeric frame (columns COL_0 ... COL_19) used as input for the SVD examples.\n",
"X = pd.DataFrame(np.random.randn(100,20), columns=[f\"COL_{j}\" for j in range(20)])\n",
"\n",
"# drop_used_columns=True: the input columns are dropped and only the 2 SVD components are returned.\n",
"svd = TruncatedSVDWrapper(n_components=2, drop_used_columns=True)\n",
"Xencoded = svd.fit_transform(X)\n",
"Xencoded.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COL_0 | \n",
" COL_1 | \n",
" COL_2 | \n",
" COL_3 | \n",
" COL_4 | \n",
" COL_5 | \n",
" COL_6 | \n",
" COL_7 | \n",
" COL_8 | \n",
" COL_9 | \n",
" ... | \n",
" COL_12 | \n",
" COL_13 | \n",
" COL_14 | \n",
" COL_15 | \n",
" COL_16 | \n",
" COL_17 | \n",
" COL_18 | \n",
" COL_19 | \n",
" SVD__0 | \n",
" SVD__1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.858982 | \n",
" -0.655989 | \n",
" -0.028417 | \n",
" -0.357398 | \n",
" 0.569531 | \n",
" 0.145816 | \n",
" 0.552368 | \n",
" 1.983438 | \n",
" 1.092890 | \n",
" -0.453562 | \n",
" ... | \n",
" 0.285189 | \n",
" -0.604234 | \n",
" -1.053623 | \n",
" -0.291745 | \n",
" -1.646335 | \n",
" -0.215531 | \n",
" 0.008500 | \n",
" 1.100297 | \n",
" 2.076530 | \n",
" -0.854836 | \n",
"
\n",
" \n",
" 1 | \n",
" -0.187936 | \n",
" 0.041684 | \n",
" 0.941944 | \n",
" 1.898925 | \n",
" 0.179125 | \n",
" 0.636418 | \n",
" 2.050173 | \n",
" 0.229349 | \n",
" -1.910368 | \n",
" 0.702720 | \n",
" ... | \n",
" -0.533445 | \n",
" -0.371779 | \n",
" -0.401205 | \n",
" 0.231492 | \n",
" -1.043176 | \n",
" 1.842388 | \n",
" 0.329271 | \n",
" 0.882017 | \n",
" 0.346758 | \n",
" -0.951460 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.097298 | \n",
" -0.136058 | \n",
" -0.323606 | \n",
" -1.096158 | \n",
" -0.009371 | \n",
" -0.945267 | \n",
" 1.455854 | \n",
" -0.108160 | \n",
" 1.141867 | \n",
" -1.407562 | \n",
" ... | \n",
" 2.310153 | \n",
" 2.414735 | \n",
" -0.184708 | \n",
" -1.486121 | \n",
" -0.676003 | \n",
" -0.686621 | \n",
" -0.836830 | \n",
" 0.972978 | \n",
" 2.330389 | \n",
" 1.407472 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.928934 | \n",
" 0.269935 | \n",
" -1.274605 | \n",
" -0.287077 | \n",
" 0.279328 | \n",
" -0.320871 | \n",
" 0.802277 | \n",
" -0.713909 | \n",
" -1.039250 | \n",
" 1.227245 | \n",
" ... | \n",
" 0.020298 | \n",
" 0.259960 | \n",
" -0.885320 | \n",
" 0.014820 | \n",
" 0.268819 | \n",
" -0.432435 | \n",
" 1.254164 | \n",
" 0.031453 | \n",
" -0.572056 | \n",
" 0.539412 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.714467 | \n",
" 1.637883 | \n",
" -0.451313 | \n",
" 0.409956 | \n",
" 0.565926 | \n",
" 0.448906 | \n",
" -0.128214 | \n",
" -0.845320 | \n",
" 0.433473 | \n",
" -0.416148 | \n",
" ... | \n",
" 0.758863 | \n",
" -1.702709 | \n",
" -0.000005 | \n",
" -0.293631 | \n",
" -0.859405 | \n",
" -0.167067 | \n",
" 0.400996 | \n",
" -1.095900 | \n",
" -1.603850 | \n",
" -1.532443 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 22 columns
\n",
"
"
],
"text/plain": [
" COL_0 COL_1 COL_2 COL_3 COL_4 COL_5 COL_6 \\\n",
"0 0.858982 -0.655989 -0.028417 -0.357398 0.569531 0.145816 0.552368 \n",
"1 -0.187936 0.041684 0.941944 1.898925 0.179125 0.636418 2.050173 \n",
"2 1.097298 -0.136058 -0.323606 -1.096158 -0.009371 -0.945267 1.455854 \n",
"3 0.928934 0.269935 -1.274605 -0.287077 0.279328 -0.320871 0.802277 \n",
"4 -0.714467 1.637883 -0.451313 0.409956 0.565926 0.448906 -0.128214 \n",
"\n",
" COL_7 COL_8 COL_9 ... COL_12 COL_13 COL_14 COL_15 \\\n",
"0 1.983438 1.092890 -0.453562 ... 0.285189 -0.604234 -1.053623 -0.291745 \n",
"1 0.229349 -1.910368 0.702720 ... -0.533445 -0.371779 -0.401205 0.231492 \n",
"2 -0.108160 1.141867 -1.407562 ... 2.310153 2.414735 -0.184708 -1.486121 \n",
"3 -0.713909 -1.039250 1.227245 ... 0.020298 0.259960 -0.885320 0.014820 \n",
"4 -0.845320 0.433473 -0.416148 ... 0.758863 -1.702709 -0.000005 -0.293631 \n",
"\n",
" COL_16 COL_17 COL_18 COL_19 SVD__0 SVD__1 \n",
"0 -1.646335 -0.215531 0.008500 1.100297 2.076530 -0.854836 \n",
"1 -1.043176 1.842388 0.329271 0.882017 0.346758 -0.951460 \n",
"2 -0.676003 -0.686621 -0.836830 0.972978 2.330389 1.407472 \n",
"3 0.268819 -0.432435 1.254164 0.031453 -0.572056 0.539412 \n",
"4 -0.859405 -0.167067 0.400996 -1.095900 -1.603850 -1.532443 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"svd = TruncatedSVDWrapper(n_components=2, drop_used_columns=False)\n",
"Xencoded = svd.fit_transform(X)\n",
"Xencoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Another example of the usage of 'drop_used_columns' and 'drop_unused_columns':\n",
" * in the first case (drop_used_columns = True): only the SVD columns are retrieved\n",
" * in the second case (drop_used_columns = False): I retrieve the original columns AND the SVD columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}