{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### GraphPipeline getting started ###\n", "This notebook is here to show a few things that can be done by the package.\n", "\n", "It doesn't means that these are the things you should do on that particular dataset.\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's load titanic dataset to test a few things" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore') # to remove gensim warning" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from aikit.datasets.datasets import load_dataset, DatasetEnum\n", "Xtrain, y_train, _ ,_ , _ = load_dataset(DatasetEnum.titanic)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclassnamesexagesibspparchticketfarecabinembarkedboatbodyhome_dest
01McCarthy, Mr. Timothy Jmale54.0001746351.8625E46SNaN175.0Dorchester, MA
11Fortune, Mr. Markmale64.01419950263.0000C23 C25 C27SNaNNaNWinnipeg, MB
21Sagesser, Mlle. Emmafemale24.000PC 1747769.3000B35C9NaNNaN
33Panula, Master. Urho Abrahammale2.041310129539.6875NaNSNaNNaNNaN
41Maioni, Miss. Robertafemale16.00011015286.5000B79S8NaNNaN
53Waelens, Mr. Achillemale22.0003457679.0000NaNSNaNNaNAntwerp, Belgium / Stanton, OH
63Reed, Mr. James GeorgemaleNaN003623167.2500NaNSNaNNaNNaN
71Swift, Mrs. Frederick Joel (Margaret Welles Ba...female48.0001746625.9292D17S8NaNBrooklyn, NY
81Smith, Mrs. Lucien Philip (Mary Eloise Hughes)female18.0101369560.0000C31S6NaNHuntington, WV
91Rowe, Mr. Alfred Gmale33.00011379026.5500NaNSNaN109.0London
103Meo, Mr. Alfonzomale55.500A.5. 112068.0500NaNSNaN201.0NaN
113Abbott, Mr. Rossmore Edwardmale16.011C.A. 267320.2500NaNSNaN190.0East Providence, RI
123Elias, Mr. DibomaleNaN0026747.2250NaNCNaNNaNNaN
132Reynaldo, Ms. Encarnacionfemale28.00023043413.0000NaNS9NaNSpain
143Khalil, Mr. BetrosmaleNaN10266014.4542NaNCNaNNaNNaN
151Daniels, Miss. Sarahfemale33.000113781151.5500NaNS8NaNNaN
163Ford, Miss. Robina Maggie 'Ruby'female9.022W./C. 660834.3750NaNSNaNNaNRotherfield, Sussex, England Essex Co, MA
173Thorneycroft, Mrs. Percival (Florence Kate White)femaleNaN1037656416.1000NaNS10NaNNaN
183Lennon, Mr. DenismaleNaN1037037115.5000NaNQNaNNaNNaN
193de Pelsmaeker, Mr. Alfonsmale16.0003457789.5000NaNSNaNNaNNaN
\n", "
" ], "text/plain": [ " pclass name sex age \\\n", "0 1 McCarthy, Mr. Timothy J male 54.0 \n", "1 1 Fortune, Mr. Mark male 64.0 \n", "2 1 Sagesser, Mlle. Emma female 24.0 \n", "3 3 Panula, Master. Urho Abraham male 2.0 \n", "4 1 Maioni, Miss. Roberta female 16.0 \n", "5 3 Waelens, Mr. Achille male 22.0 \n", "6 3 Reed, Mr. James George male NaN \n", "7 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 \n", "8 1 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) female 18.0 \n", "9 1 Rowe, Mr. Alfred G male 33.0 \n", "10 3 Meo, Mr. Alfonzo male 55.5 \n", "11 3 Abbott, Mr. Rossmore Edward male 16.0 \n", "12 3 Elias, Mr. Dibo male NaN \n", "13 2 Reynaldo, Ms. Encarnacion female 28.0 \n", "14 3 Khalil, Mr. Betros male NaN \n", "15 1 Daniels, Miss. Sarah female 33.0 \n", "16 3 Ford, Miss. Robina Maggie 'Ruby' female 9.0 \n", "17 3 Thorneycroft, Mrs. Percival (Florence Kate White) female NaN \n", "18 3 Lennon, Mr. Denis male NaN \n", "19 3 de Pelsmaeker, Mr. Alfons male 16.0 \n", "\n", " sibsp parch ticket fare cabin embarked boat body \\\n", "0 0 0 17463 51.8625 E46 S NaN 175.0 \n", "1 1 4 19950 263.0000 C23 C25 C27 S NaN NaN \n", "2 0 0 PC 17477 69.3000 B35 C 9 NaN \n", "3 4 1 3101295 39.6875 NaN S NaN NaN \n", "4 0 0 110152 86.5000 B79 S 8 NaN \n", "5 0 0 345767 9.0000 NaN S NaN NaN \n", "6 0 0 362316 7.2500 NaN S NaN NaN \n", "7 0 0 17466 25.9292 D17 S 8 NaN \n", "8 1 0 13695 60.0000 C31 S 6 NaN \n", "9 0 0 113790 26.5500 NaN S NaN 109.0 \n", "10 0 0 A.5. 11206 8.0500 NaN S NaN 201.0 \n", "11 1 1 C.A. 2673 20.2500 NaN S NaN 190.0 \n", "12 0 0 2674 7.2250 NaN C NaN NaN \n", "13 0 0 230434 13.0000 NaN S 9 NaN \n", "14 1 0 2660 14.4542 NaN C NaN NaN \n", "15 0 0 113781 151.5500 NaN S 8 NaN \n", "16 2 2 W./C. 
6608 34.3750 NaN S NaN NaN \n", "17 1 0 376564 16.1000 NaN S 10 NaN \n", "18 1 0 370371 15.5000 NaN Q NaN NaN \n", "19 0 0 345778 9.5000 NaN S NaN NaN \n", "\n", " home_dest \n", "0 Dorchester, MA \n", "1 Winnipeg, MB \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 Antwerp, Belgium / Stanton, OH \n", "6 NaN \n", "7 Brooklyn, NY \n", "8 Huntington, WV \n", "9 London \n", "10 NaN \n", "11 East Providence, RI \n", "12 NaN \n", "13 Spain \n", "14 NaN \n", "15 NaN \n", "16 Rotherfield, Sussex, England Essex Co, MA \n", "17 NaN \n", "18 NaN \n", "19 NaN " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Xtrain.head(20)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],\n", " dtype=int64)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train[0:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For now let's ignore the Name and Ticket column which should probably be handled as text" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Matplotlib won't work\n" ] } ], "source": [ "import pandas as pd\n", "from aikit.transformers import TruncatedSVDWrapper, NumImputer, CountVectorizerWrapper, NumericalEncoder\n", "from aikit.pipeline import GraphPipeline\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pclass',\n", " 'sex',\n", " 'age',\n", " 'sibsp',\n", " 'parch',\n", " 'fare',\n", " 'cabin',\n", " 'embarked',\n", " 'boat',\n", " 'body',\n", " 'home_dest']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "non_text_cols = [c for c in Xtrain.columns if c 
not in (\"ticket\",\"name\")] # everything that is not text\n", "non_text_cols" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "forest\r\n", "\r\n", "forest\r\n", "\r\n", "\r\n", "imp->forest\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline = GraphPipeline(models = { \"enc\":NumericalEncoder(),\n", " \"imp\":NumImputer(),\n", " \"forest\":RandomForestClassifier(n_estimators=100)\n", " },\n", " edges = [(\"enc\",\"imp\",\"forest\")])\n", "\n", "gpipeline.fit(Xtrain.loc[:,non_text_cols],y_train)\n", "gpipeline.graphviz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Let's do a cross-validation" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 3.8s finished\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
test_roc_auctest_accuracytest_neg_log_losstrain_roc_auctrain_accuracytrain_neg_log_lossfit_timescore_timen_test_samplesfold_nb
00.9973320.990476-0.0503910.9998300.995758-0.0295590.2005670.0768031050
10.9683690.961905-0.7232500.9999860.997879-0.0226510.1925970.0668561051
20.9832320.942857-0.1544830.9998160.995758-0.0262560.2005260.0698521052
31.0000001.000000-0.0357420.9997070.995758-0.0308250.2100220.0707141053
40.9963800.961905-0.0883000.9998020.995758-0.0286420.1990740.0648681054
50.9918060.952381-0.1257930.9997970.997879-0.0258160.1936440.0703611055
61.0000001.000000-0.0409400.9997030.995758-0.0296090.2150090.0658311056
70.9963800.980952-0.0885080.9998420.996819-0.0266140.1847090.0777911057
80.9928380.971154-0.1070170.9997930.995763-0.0276100.1875330.0637961048
90.9996130.980769-0.0720260.9997640.995763-0.0288870.1995050.0628471049
\n", "
" ], "text/plain": [ " test_roc_auc test_accuracy test_neg_log_loss train_roc_auc \\\n", "0 0.997332 0.990476 -0.050391 0.999830 \n", "1 0.968369 0.961905 -0.723250 0.999986 \n", "2 0.983232 0.942857 -0.154483 0.999816 \n", "3 1.000000 1.000000 -0.035742 0.999707 \n", "4 0.996380 0.961905 -0.088300 0.999802 \n", "5 0.991806 0.952381 -0.125793 0.999797 \n", "6 1.000000 1.000000 -0.040940 0.999703 \n", "7 0.996380 0.980952 -0.088508 0.999842 \n", "8 0.992838 0.971154 -0.107017 0.999793 \n", "9 0.999613 0.980769 -0.072026 0.999764 \n", "\n", " train_accuracy train_neg_log_loss fit_time score_time n_test_samples \\\n", "0 0.995758 -0.029559 0.200567 0.076803 105 \n", "1 0.997879 -0.022651 0.192597 0.066856 105 \n", "2 0.995758 -0.026256 0.200526 0.069852 105 \n", "3 0.995758 -0.030825 0.210022 0.070714 105 \n", "4 0.995758 -0.028642 0.199074 0.064868 105 \n", "5 0.997879 -0.025816 0.193644 0.070361 105 \n", "6 0.995758 -0.029609 0.215009 0.065831 105 \n", "7 0.996819 -0.026614 0.184709 0.077791 105 \n", "8 0.995763 -0.027610 0.187533 0.063796 104 \n", "9 0.995763 -0.028887 0.199505 0.062847 104 \n", "\n", " fold_nb \n", "0 0 \n", "1 1 \n", "2 2 \n", "3 3 \n", "4 4 \n", "5 5 \n", "6 6 \n", "7 7 \n", "8 8 \n", "9 9 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aikit.cross_validation import cross_validation\n", "from sklearn.model_selection import StratifiedKFold\n", "cv = StratifiedKFold(10, random_state=123, shuffle=True)\n", "\n", "cv_result = cross_validation(gpipeline, Xtrain.loc[:,non_text_cols], y_train,cv = cv,\n", " scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This cross-validate the complete Pipeline. 
The difference with sklearn function is that :\n", "* you can score more than one metric at a time\n", "* you retrieve train and test score" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "test_roc_auc 0.992595\n", "test_accuracy 0.974240\n", "test_neg_log_loss -0.148645\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can do the same but selecting the columns directly in the pipeline :" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "forest\r\n", "\r\n", "forest\r\n", "\r\n", "\r\n", "imp->forest\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aikit.transformers import ColumnsSelector\n", "gpipeline2 = GraphPipeline(models = { \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"forest\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\",\"enc\",\"imp\",\"forest\")])\n", "\n", "gpipeline2.fit(Xtrain,y_train)\n", "gpipeline2.graphviz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Remark : 'columns_to_use=\"object\"' tells aikit to encode the columns of type object, it will keep the rest untouched" ] }, { "cell_type": "code", "execution_count": 11, "metadata": 
{}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 4.0s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.991698\n", "test_accuracy 0.972335\n", "test_neg_log_loss -0.178280\n", "dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline2,Xtrain,y_train,cv = cv,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's see what we can do with the columns we excluded. We could craft features from them, but let's try to use them as text directly." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CountVectorizerWrapper(analyzer='word', column_prefix='BAG',\n", " columns_to_use=['ticket', 'name'],\n", " desired_output_type='SparseArray',\n", " drop_unused_columns=True, drop_used_columns=True,\n", " max_df=1.0, max_features=None, min_df=1, ngram_range=1,\n", " regex_match=False, tfidf=False, vocabulary=None)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_cols = [\"ticket\",\"name\"]\n", "vect = CountVectorizerWrapper(analyzer=\"word\", columns_to_use=text_cols)\n", "vect.fit(Xtrain,y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Remark : aikit CountVectorizer can direcly work on 2 (or more) columns, no need to use a FeatureUnion or something of the sort" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ticket__BAG__10482',\n", " 'ticket__BAG__110152',\n", " 'ticket__BAG__110413',\n", " 'ticket__BAG__110465',\n", " 'ticket__BAG__110469',\n", " 'ticket__BAG__110489',\n", " 'ticket__BAG__110564',\n", " 'ticket__BAG__110813',\n", " 'ticket__BAG__111163',\n", " 'ticket__BAG__111240',\n", " 'ticket__BAG__111320',\n", " 'ticket__BAG__111361',\n", " 'ticket__BAG__111369',\n", " 'ticket__BAG__111426',\n", " 'ticket__BAG__111427',\n", " 'ticket__BAG__112050',\n", " 'ticket__BAG__112052',\n", " 'ticket__BAG__112053',\n", " 'ticket__BAG__112058',\n", " 'ticket__BAG__11206',\n", " '...',\n", " 'name__BAG__woolf',\n", " 'name__BAG__woolner',\n", " 'name__BAG__worth',\n", " 'name__BAG__wright',\n", " 'name__BAG__wyckoff',\n", " 'name__BAG__yarred',\n", " 'name__BAG__yasbeck',\n", " 'name__BAG__ylio',\n", " 'name__BAG__yoto',\n", " 'name__BAG__young',\n", " 'name__BAG__youseff',\n", " 'name__BAG__yousif',\n", " 'name__BAG__youssef',\n", " 'name__BAG__yousseff',\n", " 'name__BAG__yrois',\n", " 'name__BAG__zabour',\n", " 
'name__BAG__zakarian',\n", " 'name__BAG__zebley',\n", " 'name__BAG__zenni',\n", " 'name__BAG__zillah']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features = vect.get_feature_names()\n", "features[0:20] + [\"...\"] + features[-20:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The encoder directly encodes the 2 features" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<1048x2440 sparse matrix of type ''\n", "\twith 5414 stored elements in COOrdinate format>" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xx_res = vect.transform(Xtrain)\n", "xx_res" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Again let's create a GraphPipeline to cross-validate" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "vect\r\n", "\r\n", "vect\r\n", "\r\n", "\r\n", "logit\r\n", "\r\n", "logit\r\n", "\r\n", "\r\n", "vect->logit\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline3 = GraphPipeline(models = {\"vect\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"logit\":LogisticRegression(solver=\"liblinear\", random_state=123)},\n", " edges=[(\"vect\",\"logit\")])\n", "gpipeline3.fit(Xtrain,y_train)\n", "gpipeline3.graphviz" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n", "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", 
"\n", "cv 3 started\n", "\n", "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n", "cv 6 started\n", "\n", "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n", "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 0.9s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.850918\n", "test_accuracy 0.819679\n", "test_neg_log_loss -0.451681\n", "dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline3, Xtrain,y_train,cv = cv,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also try we \"bag of char\"" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "vect\r\n", "\r\n", "vect\r\n", "\r\n", "\r\n", "logit\r\n", "\r\n", "logit\r\n", "\r\n", "\r\n", "vect->logit\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline4 = GraphPipeline(models = {\n", " \"vect\": CountVectorizerWrapper(analyzer=\"char\",ngram_range=(1,4),columns_to_use=text_cols),\n", " \"logit\": LogisticRegression(solver=\"liblinear\", random_state=123) }, edges=[(\"vect\",\"logit\")])\n", "gpipeline4.fit(Xtrain,y_train)\n", "gpipeline4.graphviz" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 5.9s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.849773\n", "test_accuracy 0.813956\n", "test_neg_log_loss -0.559254\n", "dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline4,Xtrain,y_train,cv = cv,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Now let's use all the columns" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "rf\r\n", "\r\n", "rf\r\n", "\r\n", "\r\n", "imp->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect\r\n", "\r\n", "vect\r\n", "\r\n", 
"\r\n", "vect->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline5 = GraphPipeline(models = {\n", " \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"vect\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\",\"enc\",\"imp\",\"rf\"),(\"vect\",\"rf\")])\n", "gpipeline5.fit(Xtrain,y_train)\n", "gpipeline5.graphviz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This model uses both set of columns:\n", "* bag of word\n", "* and categorical/numerical features" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 11.0s finished\n" 
] }, { "data": { "text/plain": [ "test_roc_auc 0.992779\n", "test_accuracy 0.968507\n", "test_neg_log_loss -0.173236\n", "dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline5,Xtrain,y_train,cv = cv,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also use both Bag of Char and Bag of Word " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "rf\r\n", "\r\n", "rf\r\n", "\r\n", "\r\n", "imp->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char\r\n", "\r\n", "vect_char\r\n", "\r\n", "\r\n", "vect_char->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_word\r\n", "\r\n", "vect_word\r\n", "\r\n", "\r\n", "vect_word->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline6 = GraphPipeline(models = {\n", " \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"vect_char\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"vect_word\":CountVectorizerWrapper(analyzer=\"char\",ngram_range=(1,4),columns_to_use=text_cols),\n", " \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\",\"enc\",\"imp\",\"rf\"),(\"vect_char\",\"rf\"),(\"vect_word\",\"rf\")])\n", 
"gpipeline6.fit(Xtrain,y_train)\n", "gpipeline6.graphviz" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 13.9s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.947360\n", "test_accuracy 0.843516\n", "test_neg_log_loss -0.325666\n", "dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline6,Xtrain,y_train,cv = cv,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Maybe we can try SVD to limit dimension of bag of char/word features" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", 
"enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "rf\r\n", "\r\n", "rf\r\n", "\r\n", "\r\n", "imp->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "svd\r\n", "\r\n", "svd\r\n", "\r\n", "\r\n", "svd->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_word\r\n", "\r\n", "vect_word\r\n", "\r\n", "\r\n", "vect_word->svd\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char\r\n", "\r\n", "vect_char\r\n", "\r\n", "\r\n", "vect_char->svd\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline7 = GraphPipeline(models = {\n", " \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"vect_word\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"vect_char\":CountVectorizerWrapper(analyzer=\"char\",ngram_range=(1,4),columns_to_use=text_cols),\n", " \"svd\":TruncatedSVDWrapper(n_components=100, random_state=123),\n", " \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\", \"enc\",\"imp\",\"rf\"),(\"vect_word\",\"svd\",\"rf\"),(\"vect_char\",\"svd\",\"rf\")])\n", "gpipeline7.fit(Xtrain,y_train)\n", "gpipeline7.graphviz" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": 
[ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 23.4s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.992953\n", "test_accuracy 0.972326\n", "test_neg_log_loss -0.167037\n", "dtype: float64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline7,Xtrain,y_train,cv = 10,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can even add 'SVD' columns AND bag of word/char columns " ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "rf\r\n", "\r\n", "rf\r\n", "\r\n", "\r\n", "imp->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "svd\r\n", "\r\n", "svd\r\n", "\r\n", "\r\n", "svd->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_word\r\n", "\r\n", "vect_word\r\n", "\r\n", "\r\n", "vect_word->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_word->svd\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char\r\n", 
"\r\n", "vect_char\r\n", "\r\n", "\r\n", "vect_char->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char->svd\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline8 = GraphPipeline(models = {\n", " \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"vect_word\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"vect_char\":CountVectorizerWrapper(analyzer=\"char\",ngram_range=(1,4),columns_to_use=text_cols),\n", " \"svd\":TruncatedSVDWrapper(n_components=100, random_state=123),\n", " \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\",\"enc\",\"imp\",\"rf\"),(\"vect_word\",\"svd\",\"rf\"),(\"vect_char\",\"svd\",\"rf\"),(\"vect_word\",\"rf\"),(\"vect_char\",\"rf\")])\n", "\n", "gpipeline8.graphviz" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 22.1s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.941329\n", "test_accuracy 0.834011\n", "test_neg_log_loss -0.334545\n", "dtype: float64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline8,Xtrain,y_train,cv = 10,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Instead of 'SVD' we can add a layer that filters columns... " ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "from aikit.transformers import FeaturesSelectorClassifier" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "rf\r\n", "\r\n", "rf\r\n", "\r\n", "\r\n", "imp->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "selector\r\n", "\r\n", "selector\r\n", "\r\n", "\r\n", "selector->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_word\r\n", "\r\n", "vect_word\r\n", "\r\n", "\r\n", "vect_word->selector\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char\r\n", "\r\n", "vect_char\r\n", "\r\n", "\r\n", "vect_char->selector\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline9 = GraphPipeline(models = {\n", " \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " 
\"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"vect_word\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"vect_char\":CountVectorizerWrapper(analyzer=\"char\",ngram_range=(1,4),columns_to_use=text_cols),\n", " \"selector\":FeaturesSelectorClassifier(n_components=20),\n", " \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\",\"enc\",\"imp\",\"rf\"),(\"vect_word\",\"selector\",\"rf\"),(\"vect_char\",\"selector\",\"rf\")])\n", "\n", "gpipeline9.graphviz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Retrieve feature importance\n", "Let's use that complicated example to show how to retrieve the feature importance" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "boat____null__ 3.839758e-01\n", "sex__female 3.816301e-02\n", "name__BAG__mr 3.715979e-02\n", "name__BAG__mr. 3.636483e-02\n", "fare 3.419880e-02\n", "name__BAG__mr. 3.133609e-02\n", "sex__male 2.962421e-02\n", "name__BAG__r. 2.910019e-02\n", "name__BAG__s. 2.776609e-02\n", "boat__15 2.672268e-02\n", "age 2.643157e-02\n", "name__BAG__s. 2.500470e-02\n", "name__BAG__ mr. 2.249752e-02\n", "boat__13 1.863079e-02\n", "boat____default__ 1.711391e-02\n", "pclass 1.665125e-02\n", "name__BAG__ 1.597853e-02\n", "sibsp 1.524516e-02\n", "home_dest____null__ 1.015056e-02\n", "boat__7 9.817018e-03\n", "home_dest____default__ 9.534058e-03\n", "boat__C 9.453317e-03\n", "cabin____null__ 8.265959e-03\n", "cabin____default__ 7.290940e-03\n", "parch 7.138940e-03\n", "embarked__S 6.643220e-03\n", "boat__5 6.206360e-03\n", "name__BAG__iss. 
6.139824e-03\n", "embarked__C 6.040638e-03\n", "boat__3 5.547742e-03\n", "name__BAG__( 5.352397e-03\n", "name__BAG__mr 5.260205e-03\n", "body_isnull 4.829877e-03\n", "name__BAG__ ( 4.360392e-03\n", "boat__16 4.245866e-03\n", "boat__9 4.224166e-03\n", "boat__D 4.194419e-03\n", "name__BAG__ss 4.076246e-03\n", "embarked__Q 4.047912e-03\n", "name__BAG__mrs 3.602001e-03\n", "body 2.955222e-03\n", "name__BAG__rs 2.899086e-03\n", "name__BAG__rs. 2.869114e-03\n", "age_isnull 2.859144e-03\n", "boat__14 2.809765e-03\n", "boat__10 2.695927e-03\n", "name__BAG__rs. 2.165917e-03\n", "boat__12 2.103210e-03\n", "name__BAG__mrs. 2.064884e-03\n", "home_dest__New York, NY 1.799501e-03\n", "boat__11 1.495248e-03\n", "name__BAG__miss 1.054318e-03\n", "name__BAG__mrs 9.616334e-04\n", "boat__4 9.420111e-04\n", "boat__6 8.184419e-04\n", "home_dest__London 7.515602e-04\n", "boat__8 3.679950e-04\n", "fare_isnull 2.438310e-09\n", "dtype: float64" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline9.fit(Xtrain, y_train)\n", "\n", "df_imp = pd.Series(gpipeline9.models[\"rf\"].feature_importances_,\n", " index = gpipeline9.get_input_features_at_node(\"rf\"))\n", "df_imp.sort_values(ascending=False,inplace=True)\n", "df_imp" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 15.0s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.994108\n", "test_accuracy 0.973288\n", "test_neg_log_loss -0.153255\n", "dtype: float64" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline9,Xtrain,y_train,cv = 10,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "%3\r\n", "\r\n", "\r\n", "enc\r\n", "\r\n", "enc\r\n", "\r\n", "\r\n", "imp\r\n", "\r\n", "imp\r\n", "\r\n", "\r\n", "enc->imp\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "rf\r\n", "\r\n", "rf\r\n", "\r\n", "\r\n", "imp->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "sel\r\n", "\r\n", "sel\r\n", "\r\n", "\r\n", "sel->enc\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "selector\r\n", "\r\n", "selector\r\n", "\r\n", "\r\n", "selector->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_word\r\n", "\r\n", "vect_word\r\n", "\r\n", "\r\n", "vect_word->selector\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "svd\r\n", "\r\n", "svd\r\n", "\r\n", "\r\n", "vect_word->svd\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "svd->rf\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char\r\n", "\r\n", "vect_char\r\n", "\r\n", "\r\n", "vect_char->selector\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "vect_char->svd\r\n", "\r\n", "\r\n", "\r\n", "\r\n", "\r\n" ], "text/plain": [ "" ] }, "execution_count": 31, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "gpipeline10 = GraphPipeline(models = {\n", " \"sel\":ColumnsSelector(columns_to_use=non_text_cols),\n", " \"enc\":NumericalEncoder(columns_to_use=\"object\"),\n", " \"imp\":NumImputer(),\n", " \"vect_word\":CountVectorizerWrapper(analyzer=\"word\",columns_to_use=text_cols),\n", " \"vect_char\":CountVectorizerWrapper(analyzer=\"char\",ngram_range=(1,4),columns_to_use=text_cols),\n", " \"svd\":TruncatedSVDWrapper(n_components=10),\n", " \"selector\":FeaturesSelectorClassifier(n_components=10, random_state=123),\n", " \"rf\":RandomForestClassifier(n_estimators=100, random_state=123)\n", " },\n", " edges = [(\"sel\",\"enc\",\"imp\",\"rf\"),\n", " (\"vect_word\",\"selector\",\"rf\"),\n", " (\"vect_char\",\"selector\",\"rf\"),\n", " (\"vect_word\",\"svd\",\"rf\"),\n", " (\"vect_char\",\"svd\",\"rf\")])\n", "\n", "gpipeline10.fit(Xtrain,y_train)\n", "gpipeline10.graphviz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is what happens in this model:\n", "* categorical columns are encoded ('enc')\n", "* missing values are filled ('imp')\n", "* bag-of-word and bag-of-char features are created for the two text columns ('vect_word', 'vect_char')\n", "* an SVD is applied to those features ('svd')\n", "* a selector keeps the most important bag-of-word/char features ('selector')\n", "* everything is fed to a RandomForest ('rf')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 0 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 1 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 2 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 3 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 4 started\n", "\n" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "cv 5 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 6 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 7 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 8 started\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cv 9 started\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 16.9s finished\n" ] }, { "data": { "text/plain": [ "test_roc_auc 0.994333\n", "test_accuracy 0.975201\n", "test_neg_log_loss -0.143788\n", "dtype: float64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_result = cross_validation(gpipeline10,Xtrain,y_train,cv = 10,scoring=[\"roc_auc\",\"accuracy\",\"neg_log_loss\"])\n", "cv_result.loc[:,(\"test_roc_auc\",\"test_accuracy\",\"test_neg_log_loss\")].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we saw, GraphPipeline allows flexibility in the creation of models, and several choices can easily be tested.\n", "\n", "Again, these are not the best possible choices for this dataset; the examples are here to illustrate the capabilities.\n", "\n", "Better scores could be obtained by adjusting hyper-parameters and/or models/transformers and by creating new features.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }