XGBoost using TF-IDF and Word Distance Features

Note, this resulted in a log-loss score of 0.39 on the test data set

Import the necessary modules

In [43]:
##########################################
# Load Required Python Libraries
##########################################
stop_words = set(stopwords.words("english"))
import pandas as pd
import numpy as np
import scipy
import xgboost as xgb
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pylev import levenshtein
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import word2vec, KeyedVectors
from scipy.stats import kurtosis
from sklearn.externals import joblib
import xgboost as xgb
##########################################
In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')
data['question1'] = data['question1'].astype(str)
data['question2'] = data['question2'].astype(str)
y = data['is_duplicate']
df_train = data
##########################################

The word_lengths function returns the number of common words between two questions

In [58]:
def word_lengths(q1,q2):
    data = pd.concat([q1, q2], axis=1)

    #Length of Question
    data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
    data['len_q2'] = data.question2.apply(lambda x: len(str(x)))

    #Feature: Difference in length between the Questions
    data['len_diff'] = data.len_q1 - data.len_q2
    
    #Word count of Question
    data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
    data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
    
    #Feature: Difference in length between the words
    data['word_diff'] = data.len_word_q1 - data.len_word_q2
    
    #Feature: Common words between the Questions
    data['len_common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    final_word_features = [data['len_diff'], data['word_diff'], data['len_common_words']]
    final_word_features = np.column_stack((np.array(data['len_diff']),
                                          np.array(data['word_diff']),
                                          np.array(data['len_common_words']),
                                          ))
    return final_word_features

def shared_words(q1,q2):
    question1_words = []
    question2_words = []

    for word in set(str(q1).lower().split()):
        if word not in stop_words:
            question1_words.append(word)

    for word in set(str(q2).lower().split()):
        if word not in stop_words:
            question2_words.append(word)

    #Question contains only stop words (or is an empty string)
    if len(question1_words) == 0 or len(question2_words) == 0:
        return 0

    question1_shared_words = [w for w in question1_words if w in question2_words]
    question2_shared_words = [w for w in question2_words if w in question1_words]

    avg_words_shared = (len(question1_shared_words) + len(question2_shared_words))/(len(question1_words) + len(question2_words))
    return avg_words_shared

These objects will be used to create each of the separate features we'll be feeding into our machine learning algorithm

In [61]:
class LevDistanceTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        
        lev_distance_strings = [[a,b] 
        for a,b in zip(q1_list, q2_list)]
        
        lev_dist_array = np.array([
    (float(levenshtein(pair[0], pair[1]))/
    (float(sum([x.count('') for x in pair[0]])) + 
    float(sum([x.count('') for x in pair[1]])))) 
    for pair in lev_distance_strings 
        ])
        
        return lev_dist_array.reshape(len(lev_dist_array),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class TfIdfDiffTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self, total_words):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        total_questions = q1_list + q2_list
        total_questions = [x for x in total_questions if type(x) != float]
        
        vectorizer = TfidfVectorizer(stop_words = 'english', vocabulary = total_words)
        vectorizer.fit(total_questions)
        tf_diff = vectorizer.transform(q1_list) - vectorizer.transform(q2_list)
        return tf_diff

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class CosineDistTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        total_questions = q1_list + q2_list
        total_questions = [x for x in total_questions if type(x) != float]
        
        vectorizer = TfidfVectorizer(stop_words = 'english')
        vectorizer.fit(total_questions)
        
        q1_tf = vectorizer.transform(q1_list) 
        q2_tf = vectorizer.transform(q2_list)
        cos_sim = []
        for i in range(0,len(q1_list)):
            cos_sim.append(cosine_similarity(q1_tf[i], q2_tf[i])[0][0])
            
        return np.array(cos_sim).reshape(len(cos_sim),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class AverageSharedWords(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        avg_words = [shared_words(q1,q2) for q1, q2 in zip(q1_list, q2_list)]

        return np.array(avg_words).reshape(len(avg_words),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

class WordLengths(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        word_len = word_lengths(q1_list, q2_list)
        return word_len

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self  

Build a aggregated feature transformer using FeatureUnion

In [66]:
##########################################
# Combining all the features using FeatureUnion
##########################################
vectorizer = TfidfVectorizer(stop_words = 'english')
vectorizer.fit(df_train['question1'][0:5000] + df_train['question2'][0:5000])
#vectorizer.fit(df_train['question1'] + df_train['question2'])
total_words = list(set(vectorizer.get_feature_names()))

comb_features = FeatureUnion([('tf', TfIdfDiffTransformer(total_words)), 
                              ('cos_diff',CosineDistTransformer()), 
                              ('lev', LevDistanceTransformer()),
                              ('AvgWords', AverageSharedWords()),
                              ('WordLengths', WordLengths())
                             ])
##########################################

Split into testing and training using train test split

In [74]:
##########################################
# Split the dataset into training and testing datasets
# ##########################################
y = df_train.ix[:,'is_duplicate']
all_features = comb_features.transform([df_train['question1'], df_train['question2']])
X_train, X_test, y_train, y_test = train_test_split(all_features, y, test_size=0.2, random_state=1317)
##########################################

Run xgboost

In [75]:
##########################################
# Running XGBoost
##########################################
# Set parameters for XGBoost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 10

d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

watchlist = [(d_train, 'train'), (d_test, 'test')]

bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=100, verbose_eval=100)
joblib.dump(bst, 'xgboost_model_400iterations_8depth.pkl')
##########################################
[0]	train-logloss:0.68622	test-logloss:0.686336
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.467485	test-logloss:0.477014
[200]	train-logloss:0.437951	test-logloss:0.451076
[300]	train-logloss:0.426012	test-logloss:0.441089
[399]	train-logloss:0.418002	test-logloss:0.434636
Out[75]:
['xgboost_model_400iterations_8depth.pkl']
In [76]:
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset
df_test = pd.read_csv('test.csv')

#Replaces np.nan with ''
df_test = df_test.replace(np.nan, '', regex=True)

#Saves the cleaned test.csv
df_test.to_csv('cleaned_test.csv')
##########################################
In [77]:
##########################################
# Create the test features using FeatureUnion
##########################################
# test_features = comb_features.transform([df_test['question1'][0:5000], df_test['question2'][0:5000]])
test_features = comb_features.transform([df_test['question1'], df_test['question2']])
joblib.dump(test_features, 'test_features.pkl')
##########################################
Out[77]:
['test_features.pkl']
In [78]:
##########################################
# Predicting using XGBoost
##########################################
test = xgb.DMatrix(test_features)
test_prediction = bst.predict(test)
##########################################
In [101]:
##########################################
# Creating Submission File
##########################################
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = test_prediction

##########################################
# Set probability to 0 for all test questions 
# that we know are not duplicates
##########################################
empty_questions = list(df_test[df_test['question1'] == '']['test_id']) + list(df_test[df_test['question2'] == '']['test_id'])
for question in empty_questions:
    sub.loc[question, 'is_duplicate'] = 0

sub.to_csv('simple_xgb.csv', index=False)
##########################################

Make sure the submission question length is the same as the test file

In [105]:
#Check Submission File Length
len(sub) == len(df_test)
Out[105]:
True