{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting Started\n",
"\n",
"This notebook will show you how to built a complexe pipeline using aikit and how to crossvalidated it"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
pclass
\n",
"
name
\n",
"
sex
\n",
"
age
\n",
"
sibsp
\n",
"
parch
\n",
"
ticket
\n",
"
fare
\n",
"
cabin
\n",
"
embarked
\n",
"
boat
\n",
"
body
\n",
"
home_dest
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
McCarthy, Mr. Timothy J
\n",
"
male
\n",
"
54.0
\n",
"
0
\n",
"
0
\n",
"
17463
\n",
"
51.8625
\n",
"
E46
\n",
"
S
\n",
"
NaN
\n",
"
175.0
\n",
"
Dorchester, MA
\n",
"
\n",
"
\n",
"
1
\n",
"
1
\n",
"
Fortune, Mr. Mark
\n",
"
male
\n",
"
64.0
\n",
"
1
\n",
"
4
\n",
"
19950
\n",
"
263.0000
\n",
"
C23 C25 C27
\n",
"
S
\n",
"
NaN
\n",
"
NaN
\n",
"
Winnipeg, MB
\n",
"
\n",
"
\n",
"
2
\n",
"
1
\n",
"
Sagesser, Mlle. Emma
\n",
"
female
\n",
"
24.0
\n",
"
0
\n",
"
0
\n",
"
PC 17477
\n",
"
69.3000
\n",
"
B35
\n",
"
C
\n",
"
9
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
3
\n",
"
3
\n",
"
Panula, Master. Urho Abraham
\n",
"
male
\n",
"
2.0
\n",
"
4
\n",
"
1
\n",
"
3101295
\n",
"
39.6875
\n",
"
NaN
\n",
"
S
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
4
\n",
"
1
\n",
"
Maioni, Miss. Roberta
\n",
"
female
\n",
"
16.0
\n",
"
0
\n",
"
0
\n",
"
110152
\n",
"
86.5000
\n",
"
B79
\n",
"
S
\n",
"
8
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
5
\n",
"
3
\n",
"
Waelens, Mr. Achille
\n",
"
male
\n",
"
22.0
\n",
"
0
\n",
"
0
\n",
"
345767
\n",
"
9.0000
\n",
"
NaN
\n",
"
S
\n",
"
NaN
\n",
"
NaN
\n",
"
Antwerp, Belgium / Stanton, OH
\n",
"
\n",
"
\n",
"
6
\n",
"
3
\n",
"
Reed, Mr. James George
\n",
"
male
\n",
"
NaN
\n",
"
0
\n",
"
0
\n",
"
362316
\n",
"
7.2500
\n",
"
NaN
\n",
"
S
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
7
\n",
"
1
\n",
"
Swift, Mrs. Frederick Joel (Margaret Welles Ba...
\n",
"
female
\n",
"
48.0
\n",
"
0
\n",
"
0
\n",
"
17466
\n",
"
25.9292
\n",
"
D17
\n",
"
S
\n",
"
8
\n",
"
NaN
\n",
"
Brooklyn, NY
\n",
"
\n",
"
\n",
"
8
\n",
"
1
\n",
"
Smith, Mrs. Lucien Philip (Mary Eloise Hughes)
\n",
"
female
\n",
"
18.0
\n",
"
1
\n",
"
0
\n",
"
13695
\n",
"
60.0000
\n",
"
C31
\n",
"
S
\n",
"
6
\n",
"
NaN
\n",
"
Huntington, WV
\n",
"
\n",
"
\n",
"
9
\n",
"
1
\n",
"
Rowe, Mr. Alfred G
\n",
"
male
\n",
"
33.0
\n",
"
0
\n",
"
0
\n",
"
113790
\n",
"
26.5500
\n",
"
NaN
\n",
"
S
\n",
"
NaN
\n",
"
109.0
\n",
"
London
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass name sex age \\\n",
"0 1 McCarthy, Mr. Timothy J male 54.0 \n",
"1 1 Fortune, Mr. Mark male 64.0 \n",
"2 1 Sagesser, Mlle. Emma female 24.0 \n",
"3 3 Panula, Master. Urho Abraham male 2.0 \n",
"4 1 Maioni, Miss. Roberta female 16.0 \n",
"5 3 Waelens, Mr. Achille male 22.0 \n",
"6 3 Reed, Mr. James George male NaN \n",
"7 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 \n",
"8 1 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) female 18.0 \n",
"9 1 Rowe, Mr. Alfred G male 33.0 \n",
"\n",
" sibsp parch ticket fare cabin embarked boat body \\\n",
"0 0 0 17463 51.8625 E46 S NaN 175.0 \n",
"1 1 4 19950 263.0000 C23 C25 C27 S NaN NaN \n",
"2 0 0 PC 17477 69.3000 B35 C 9 NaN \n",
"3 4 1 3101295 39.6875 NaN S NaN NaN \n",
"4 0 0 110152 86.5000 B79 S 8 NaN \n",
"5 0 0 345767 9.0000 NaN S NaN NaN \n",
"6 0 0 362316 7.2500 NaN S NaN NaN \n",
"7 0 0 17466 25.9292 D17 S 8 NaN \n",
"8 1 0 13695 60.0000 C31 S 6 NaN \n",
"9 0 0 113790 26.5500 NaN S NaN 109.0 \n",
"\n",
" home_dest \n",
"0 Dorchester, MA \n",
"1 Winnipeg, MB \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"5 Antwerp, Belgium / Stanton, OH \n",
"6 NaN \n",
"7 Brooklyn, NY \n",
"8 Huntington, WV \n",
"9 London "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aikit.datasets.datasets import load_dataset, DatasetEnum\n",
"Xtrain, y_train, _ ,_ , _ = load_dataset(DatasetEnum.titanic)\n",
"Xtrain.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 1, 0, 1, 0, 0, 1, 1, 0], dtype=int64)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train[0:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\r\n",
"\r\n",
"\r\n",
"\r\n",
"\r\n"
],
"text/plain": [
""
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aikit.pipeline import GraphPipeline\n",
"from aikit.transformers import ColumnsSelector, NumericalEncoder, NumImputer, CountVectorizerWrapper\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"text_cols = [\"name\",\"ticket\"]\n",
"non_text_cols = [c for c in Xtrain.columns if c not in text_cols]\n",
"\n",
"gpipeline = GraphPipeline(models = {\n",
" \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n",
" \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n",
" \"imp\":NumImputer(),\n",
" \"vect\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n",
" \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n",
" },\n",
" edges = [(\"sel\",\"enc\",\"imp\",\"rf\"),(\"vect\",\"rf\")])\n",
"\n",
"gpipeline.fit(Xtrain,y_train)\n",
"gpipeline.graphviz"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"cv 0 started\n",
"\n",
"cv 1 started\n",
"\n",
"cv 2 started\n",
"\n",
"cv 3 started\n",
"\n",
"cv 4 started\n",
"\n",
"cv 5 started\n",
"\n",
"cv 6 started\n",
"\n",
"cv 7 started\n",
"\n",
"cv 8 started\n",
"\n",
"cv 9 started\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 13.3s finished\n"
]
},
{
"data": {
"text/html": [
"