Note: this approach resulted in a log-loss score of 0.39 on the test data set.
Import the necessary modules
##########################################
# Load Required Python Libraries
##########################################
# Standard library
import itertools
import re

# Third-party
import chardet
import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
import scipy
import xgboost as xgb
from gensim.models import word2vec, KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from pylev import levenshtein
from scipy.stats import kurtosis
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from tqdm import tqdm

try:
    from sklearn.externals import joblib
except ImportError:
    # Newer scikit-learn releases no longer bundle joblib under
    # sklearn.externals; fall back to the standalone package.
    import joblib

# Built only after `stopwords` has been imported (it used to be referenced
# before the import, which raises NameError). The NLTK "stopwords" corpus
# must be available locally, e.g. via nltk.download('stopwords').
stop_words = set(stopwords.words("english"))
##########################################
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
# Read the training pairs and force both question columns to str in one
# cast, so that every downstream string operation can rely on str values.
# NOTE(review): a missing question becomes the literal text 'nan' here,
# whereas the test set further down replaces missing values with '' --
# confirm whether that difference is intended.
data = pd.read_csv('train.csv').astype({'question1': str, 'question2': str})
# Target labels.
y = data['is_duplicate']
# Alias used by the rest of the script (same object, not a copy).
df_train = data
##########################################
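The word_lengths function returns three features for each pair of questions: the difference in character length, the difference in word count, and the number of common words between the two questions.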
def word_lengths(q1, q2):
    """Build length-based comparison features for paired questions.

    Args:
        q1: First question of every pair (pandas Series or any other
            sequence). Each value is converted with ``str`` before it is
            measured.
        q2: Second question of every pair, same length as ``q1``.

    Returns:
        numpy array of shape ``(n_pairs, 3)`` whose columns are:
        character-length difference (q1 - q2), word-count difference
        (q1 - q2), and the number of distinct lower-cased,
        whitespace-separated tokens that occur in both questions.
    """
    # Explicit keys make the column names independent of whatever names
    # the incoming Series carry; wrapping in pd.Series lets plain lists
    # through as well.
    pairs = pd.concat(
        [pd.Series(q1), pd.Series(q2)],
        axis=1,
        keys=['question1', 'question2'],
    )
    first = pairs['question1'].map(str)
    second = pairs['question2'].map(str)

    # Feature: difference in character length between the questions
    len_diff = first.map(len) - second.map(len)

    # Feature: difference in word count between the questions
    word_diff = (first.map(lambda text: len(text.split()))
                 - second.map(lambda text: len(text.split())))

    # Feature: number of common (case-insensitive) words
    len_common_words = [
        len(set(a.lower().split()) & set(b.lower().split()))
        for a, b in zip(first, second)
    ]

    return np.column_stack((
        np.array(len_diff),
        np.array(word_diff),
        np.array(len_common_words),
    ))
def shared_words(q1, q2, stop_word_set=None):
    """Return the fraction of non-stop-words that two questions share.

    Both questions are converted with ``str``, lower-cased and split on
    whitespace; duplicate tokens and stop words are discarded.

    Args:
        q1: First question.
        q2: Second question.
        stop_word_set: Optional collection of words to ignore. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        ``2 * |shared| / (|words1| + |words2|)`` as a float, or ``0`` when
        either question has no words left after stop-word removal.
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    question1_words = {
        word for word in str(q1).lower().split() if word not in stop_word_set
    }
    question2_words = {
        word for word in str(q2).lower().split() if word not in stop_word_set
    }

    # Question contains only stop words (or is an empty string)
    if not question1_words or not question2_words:
        return 0

    # Each shared word counts once per question, hence the factor two.
    # The float literal keeps this a true division on Python 2 as well.
    shared = len(question1_words & question2_words)
    return (2.0 * shared) / (len(question1_words) + len(question2_words))
The transformer classes below each compute one of the separate features that we'll be feeding into our machine learning algorithm.
class LevDistanceTransformer(BaseEstimator, TransformerMixin):
    """Length-normalised Levenshtein distance for every question pair.

    ``transform`` takes ``[q1_list, q2_list]`` (two equally long sequences
    of strings) and returns an array of shape ``(n_pairs, 1)``.
    """

    def __init__(self):
        pass

    def transform(self, question_list):
        """Return ``levenshtein(q1, q2) / (2 * (len(q1) + len(q2)))`` per pair.

        A pair in which both questions are empty yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
        """
        q1_list = question_list[0]
        q2_list = question_list[1]

        distances = []
        for first, second in zip(q1_list, q2_list):
            # The normaliser used to be written as a sum of ``x.count('')``
            # over the characters, which is 2 per character; the factor of
            # two is kept so the feature values do not change.
            normaliser = 2.0 * (len(first) + len(second))
            if normaliser == 0:
                # Two empty strings are identical: distance 0.
                distances.append(0.0)
            else:
                distances.append(levenshtein(first, second) / normaliser)

        lev_dist_array = np.array(distances)
        return lev_dist_array.reshape(len(lev_dist_array), 1)

    def fit(self, question_list, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self
class TfIdfDiffTransformer(BaseEstimator, TransformerMixin):
    """Difference between the TF-IDF vectors of the two questions in a pair.

    Args:
        total_words: Fixed vocabulary handed to ``TfidfVectorizer``; it
            determines the columns of the returned matrix.
    """

    def __init__(self, total_words):
        # Store the argument on the instance so that transform() no longer
        # depends on a module-level ``total_words`` variable.
        self.total_words = total_words

    def transform(self, question_list):
        """Return ``tfidf(q1) - tfidf(q2)``, one row per question pair.

        NOTE(review): the vectorizer is re-fitted on every call, so the
        IDF weights are derived from whichever data set is being
        transformed.
        """
        q1_list = question_list[0]
        q2_list = question_list[1]

        # Convert to lists first: ``+`` on two pandas Series concatenates
        # the strings element-wise instead of joining the collections.
        total_questions = list(q1_list) + list(q2_list)
        # Drop float entries (e.g. NaN for missing questions).
        total_questions = [x for x in total_questions
                           if not isinstance(x, float)]

        vectorizer = TfidfVectorizer(stop_words='english',
                                     vocabulary=self.total_words)
        vectorizer.fit(total_questions)
        tf_diff = vectorizer.transform(q1_list) - vectorizer.transform(q2_list)
        return tf_diff

    def fit(self, question_list, y=None):
        """Nothing is learned here; returns ``self``."""
        return self
class CosineDistTransformer(BaseEstimator, TransformerMixin):
    """Cosine similarity between the TF-IDF vectors of each question pair.

    ``transform`` takes ``[q1_list, q2_list]`` and returns an array of
    shape ``(n_pairs, 1)``.
    """

    def __init__(self):
        pass

    def transform(self, question_list):
        """Return the row-wise cosine similarity of the two question lists.

        NOTE(review): the vectorizer is re-fitted on every call, so the
        vocabulary and IDF weights come from the data being transformed.
        """
        q1_list = question_list[0]
        q2_list = question_list[1]

        # Convert to lists first: ``+`` on two pandas Series concatenates
        # the strings element-wise instead of joining the collections.
        total_questions = list(q1_list) + list(q2_list)
        # Drop float entries (e.g. NaN for missing questions).
        total_questions = [x for x in total_questions
                           if not isinstance(x, float)]

        # norm='l2' is the TfidfVectorizer default; it is spelled out
        # because the dot product below relies on unit-length rows.
        vectorizer = TfidfVectorizer(stop_words='english', norm='l2')
        vectorizer.fit(total_questions)
        q1_tf = vectorizer.transform(q1_list)
        q2_tf = vectorizer.transform(q2_list)

        # For L2-normalised rows the cosine similarity equals the dot
        # product, so one sparse element-wise multiply and a row sum
        # replace a cosine_similarity() call per pair. All-zero rows
        # give 0.
        cos_sim = np.asarray(q1_tf.multiply(q2_tf).sum(axis=1), dtype=float)
        return cos_sim.reshape(-1, 1)

    def fit(self, question_list, y=None):
        """Nothing is learned here; returns ``self``."""
        return self
class AverageSharedWords(BaseEstimator, TransformerMixin):
    """Share of common non-stop-words for every question pair.

    ``transform`` takes ``[q1_list, q2_list]`` and returns the
    ``shared_words`` score of each pair as an ``(n_pairs, 1)`` array.
    """

    def __init__(self):
        pass

    def transform(self, question_list):
        """Score every (q1, q2) pair with ``shared_words``."""
        first_questions = question_list[0]
        second_questions = question_list[1]

        scores = []
        for first, second in zip(first_questions, second_questions):
            scores.append(shared_words(first, second))

        score_column = np.array(scores)
        return score_column.reshape(len(scores), 1)

    def fit(self, question_list, y=None):
        """Stateless transformer: fitting just hands back the instance."""
        return self
class WordLengths(BaseEstimator, TransformerMixin):
    """Length and common-word features for every question pair.

    Thin wrapper that exposes ``word_lengths`` through the transformer
    interface; ``transform`` takes ``[q1_list, q2_list]``.
    """

    def __init__(self):
        pass

    def transform(self, question_list):
        """Delegate to ``word_lengths`` and return its result unchanged."""
        return word_lengths(question_list[0], question_list[1])

    def fit(self, question_list, y=None):
        """Stateless transformer: fitting just hands back the instance."""
        return self
Build an aggregated feature transformer using FeatureUnion
##########################################
# Combining all the features using FeatureUnion
##########################################
# Learn the TF-IDF vocabulary from the first 5000 question pairs only.
vectorizer = TfidfVectorizer(stop_words = 'english')
# Convert to lists before joining: ``+`` on two pandas Series concatenates
# the strings element-wise, which can glue the last word of question1 to
# the first word of question2.
vectorizer.fit(list(df_train['question1'][0:5000]) + list(df_train['question2'][0:5000]))
#vectorizer.fit(list(df_train['question1']) + list(df_train['question2']))
# The keys of ``vocabulary_`` are the learned terms. Sorting them fixes the
# column order of the TF-IDF difference features from run to run, and it
# avoids get_feature_names(), which newer scikit-learn no longer provides.
total_words = sorted(vectorizer.vocabulary_)
comb_features = FeatureUnion([('tf', TfIdfDiffTransformer(total_words)),
                              ('cos_diff', CosineDistTransformer()),
                              ('lev', LevDistanceTransformer()),
                              ('AvgWords', AverageSharedWords()),
                              ('WordLengths', WordLengths())
                              ])
##########################################
Split into testing and training using train test split
##########################################
# Split the dataset into training and testing datasets
# ##########################################
# Target labels. ``.loc`` replaces ``.ix``, which current pandas releases
# no longer provide.
y = df_train.loc[:, 'is_duplicate']
# One feature row per question pair, in the same row order as ``y``.
all_features = comb_features.transform([df_train['question1'], df_train['question2']])
# Hold out 20% of the pairs for evaluation; the seed is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(all_features, y, test_size=0.2, random_state=1317)
##########################################
Run xgboost
##########################################
# Running XGBoost
##########################################
# Set parameters for XGBoost
# XGBoost settings, gathered in a single literal
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 10,
}

# Wrap both splits in DMatrix containers together with their labels
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Both sets are evaluated during training; the held-out set comes last
watchlist = [(d_train, 'train'), (d_test, 'test')]

# At most 400 boosting rounds, early stopping with a patience of 100 rounds,
# progress printed every 100 rounds
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=100,
)

# Persist the trained booster.
# NOTE(review): the file name says "8depth" while max_depth above is 10 --
# confirm which of the two is intended.
joblib.dump(bst, 'xgboost_model_400iterations_8depth.pkl')
##########################################
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset
# Load the test pairs and blank out missing values (np.nan -> '') in one
# chained expression
df_test = pd.read_csv('test.csv').replace(np.nan, '', regex=True)
# Keep a copy of the cleaned test set on disk
df_test.to_csv('cleaned_test.csv')
##########################################
##########################################
# Create the test features using FeatureUnion
##########################################
# Variant restricted to the first 5000 test rows (kept for quick experiments):
# test_features = comb_features.transform([df_test['question1'][0:5000], df_test['question2'][0:5000]])
# Build the feature matrix for the whole test set with the same FeatureUnion
# that produced the training features.
test_features = comb_features.transform([df_test['question1'], df_test['question2']])
# Save the computed test features to disk.
joblib.dump(test_features, 'test_features.pkl')
##########################################
##########################################
# Predicting using XGBoost
##########################################
# Wrap the test features in a DMatrix (no labels) for the trained booster.
test = xgb.DMatrix(test_features)
# One prediction per test row from the booster trained above; these values
# are written to the 'is_duplicate' column of the submission below.
test_prediction = bst.predict(test)
##########################################
##########################################
# Creating Submission File
##########################################
# Submission frame: one row per test pair, sharing df_test's index.
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = test_prediction
##########################################
# Set probability to 0 for all test questions
# that we know are not duplicates
##########################################
# Select the rows by boolean mask rather than by using test_id values as
# index labels: a label that does not exist in the index would make .loc
# silently append a new row to the submission.
has_empty_question = (df_test['question1'] == '') | (df_test['question2'] == '')
# test_ids of the affected pairs
empty_questions = list(df_test.loc[has_empty_question, 'test_id'])
sub.loc[has_empty_question, 'is_duplicate'] = 0
sub.to_csv('simple_xgb.csv', index=False)
##########################################
Make sure the submission has the same number of rows as the test file
#Check Submission File Length
# Compares the number of submission rows with the number of test rows.
# NOTE: this is a bare expression, so its True/False result is only shown
# in an interactive session (e.g. a notebook cell); when the file runs as a
# plain script the value is discarded.
len(sub) == len(df_test)