SVM

Contents

import pandas as pd
import numpy as np
from sklearn import feature_extraction
from sklearn import svm
from sklearn import preprocessing
from scipy import sparse

Load Data and Train/Test Split(s)

# Read the merged tweet table from JSON, then report (rows, columns).
df = pd.read_json(path_or_buf="../data/merged_troll_data.json")
df.shape
(332504, 8)
# Peek at five randomly drawn rows.
df.sample(n=5).head(n=5)
content followers following retweet account_category created_at troll orig_index
307969 @AIIAmericanGirI @CommonSense1212 @realDonaldT... 2267 219 0 NonTroll 2016-11-03 14:35:17 False 31589
135312 '\| ̄ ̄ ̄ ̄ ̄ ̄\| \| 33 \... 629 364 1 LeftTroll 2016-10-06 13:33:00 True 331179
82234 '@WOOKIE318 I hope you also pissed off when Cl... 12321 9091 0 RightTroll 2016-09-08 01:39:00 True 252179
300364 RT @HillaryClinton: "Everything I’ve done star... 3228 3703 1 NonTroll 2016-11-04 03:02:13 False 5521
37315 "Understanding will never bring you peace. Tha... 542 686 1 LeftTroll 2016-08-07 13:32:00 True 303267
# Precomputed split indices: one column per split strategy (random /
# temporal) and one row per partition (train / val / test).
ids = pd.read_json(path_or_buf="../data/train_test_inds.json")
len(ids['random']['train'])
266003

Prepare feature matrix

Isolate matrices

def getxy(ids, feature_cols=None, label_col=None):
    """Select the feature and label rows of the global ``df`` for one split.

    Args:
        ids: Integer row positions (used with ``.iloc``) of the rows to keep.
        feature_cols: Columns to return as features. Defaults to
            ``['content', 'followers', 'following', 'retweet']``.
        label_col: Column(s) to return as labels. Defaults to ``['troll']``.

    Returns:
        A ``(X, y)`` pair: the selected feature rows and the selected label
        rows, both taken from ``df`` in the order given by ``ids``.
    """
    # Build the defaults per call instead of sharing one mutable list object
    # across every invocation.
    if feature_cols is None:
        feature_cols = ['content', 'followers', 'following', 'retweet']
    if label_col is None:
        label_col = ['troll']

    # Later cells assign new columns into these frames; returning explicit
    # copies keeps those writes independent of ``df``.
    features = df[feature_cols].iloc[ids].copy()
    labels = df[label_col].iloc[ids].copy()
    return features, labels
# random split: train / validation / test
Xrand_train, yrand_train = getxy(ids['random']['train'])
Xrand_val, yrand_val = getxy(ids['random']['val'])
Xrand_test, yrand_test = getxy(ids['random']['test'])

# temporal split: train / validation / test
Xtemp_train, ytemp_train = getxy(ids['temporal']['train'])
Xtemp_val, ytemp_val = getxy(ids['temporal']['val'])
Xtemp_test, ytemp_test = getxy(ids['temporal']['test'])
Xrand_train.head(n=5)
content followers following retweet
204024 RT @businessinsider: OBAMA: The press doesn’t ... 14525 3311 1
45854 Review: Generation Startup https://t.co/lej8O8... 3086 2387 1
199686 RT @Kidrambler: @TomiLahren Vote for Gary John... 1117 3742 1
115712 in interpersonal relations with pple who are m... 936 582 1
245728 RT @PeterTownsend7: The Real #WarOnWomen #isi... 2891 1615 1
# Feature and label frames must have the same number of rows.
tuple(frame.shape for frame in (Xrand_train, yrand_train))
((266003, 4), (266003, 1))

Tokenize content

vocab_size = 5000

# Fit one vocabulary per split, using that split's *training* tweets only.
# Fitting on all of df['content'] would let validation/test text influence
# which tokens make it into the vocabulary (the scalers are likewise fit on
# training data only).
rand_tokenizer = feature_extraction.text.CountVectorizer(stop_words='english', max_features=vocab_size)
rand_tokenizer.fit(Xrand_train['content'])
temp_tokenizer = feature_extraction.text.CountVectorizer(stop_words='english', max_features=vocab_size)
temp_tokenizer.fit(Xtemp_train['content'])
# Keep the original name bound for any later cell that still refers to it.
tokenizer = rand_tokenizer

Xrand_train_tok = rand_tokenizer.transform(Xrand_train['content'])
Xrand_val_tok = rand_tokenizer.transform(Xrand_val['content'])
Xrand_test_tok = rand_tokenizer.transform(Xrand_test['content'])

Xtemp_train_tok = temp_tokenizer.transform(Xtemp_train['content'])
Xtemp_val_tok = temp_tokenizer.transform(Xtemp_val['content'])
Xtemp_test_tok = temp_tokenizer.transform(Xtemp_test['content'])
Xrand_train_tok.shape # token matrix dim = n x (at most) vocab_size
(266003, 5000)

Standardize followers/following

# one for each split, each fit on that split's training rows only
rand_scaler = preprocessing.StandardScaler().fit(Xrand_train[['followers','following']])
temp_scaler = preprocessing.StandardScaler().fit(Xtemp_train[['followers','following']])
print('rand means and scales: {}, {}'.format(rand_scaler.mean_, rand_scaler.scale_))
# Report the temporal scaler's own scale_ (this line previously printed
# rand_scaler.scale_, which made the two splits look identical).
print('temp means and scales: {}, {}'.format(temp_scaler.mean_, temp_scaler.scale_))
rand means and scales: [8154.90645218 3016.03233422], [219679.05451009   7816.52064337]
temp means and scales: [8757.68069533 3020.22409146], [219679.05451009   7816.52064337]

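The means are very close. (The temporal scales shown above are identical to the random-split scales only because that output was generated by printing `rand_scaler.scale_` on both lines, so the scales themselves have not actually been compared.) A single scaler would probably suffice, but I will use both anyway, in case it makes a difference.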

# Replace the raw follower/following counts with their standardized values,
# always using the scaler fitted on the matching training split.
col_to_std = ['followers', 'following']
for frame, scaler in (
    (Xrand_train, rand_scaler),
    (Xrand_val, rand_scaler),
    (Xrand_test, rand_scaler),
    (Xtemp_train, temp_scaler),
    (Xtemp_val, temp_scaler),
    (Xtemp_test, temp_scaler),
):
    frame[col_to_std] = scaler.transform(frame[col_to_std])
Xrand_train[col_to_std].head(n=5)
followers following
204024 0.028997 0.037736
45854 -0.023074 -0.080475
199686 -0.032037 0.092876
115712 -0.032861 -0.311396
245728 -0.023962 -0.179240

Binarize the boolean outcome

# Labels are still booleans at this point.
yrand_train.head(n=5)
troll
204024 False
45854 True
199686 False
115712 True
245728 False
def bool_to_bin(x):
    """Return 1 if ``x`` is truthy, otherwise 0."""
    return 1 if x else 0
# Convert the boolean troll flag of the random-split training labels to 0/1.
binary_labels = yrand_train['troll'].apply(bool_to_bin)
yrand_train['troll'] = binary_labels
yrand_train.head(n=5)
troll
204024 0
45854 1
199686 0
115712 1
245728 0
# Apply the same boolean -> 0/1 conversion to every remaining label frame.
for label_frame in (yrand_val, yrand_test, ytemp_train, ytemp_val, ytemp_test):
    label_frame['troll'] = label_frame['troll'].apply(bool_to_bin)

Concatenate features

def concatenate_features(tok_matrix, data_df, extra_cols=('followers', 'following', 'retweet')):
    """Append columns of ``data_df`` to the right of a sparse token matrix.

    Args:
        tok_matrix: ``scipy.sparse`` matrix with one row per sample.
        data_df: DataFrame holding the additional features. Rows are matched
            to ``tok_matrix`` by position, so it must have the same number of
            rows in the same order.
        extra_cols: Names of the ``data_df`` columns to append, in order.
            Defaults to the three columns this notebook uses.

    Returns:
        The sparse matrix produced by ``sparse.hstack``: the columns of
        ``tok_matrix`` followed by the selected ``data_df`` columns.
    """
    # list() so that a tuple of names selects several columns instead of
    # being interpreted as a single column key.
    selected = data_df[list(extra_cols)]
    sparse_cols = sparse.csr_matrix(selected)
    return sparse.hstack([tok_matrix, sparse_cols])
# Random split: token counts + followers/following/retweet.
Xrand_train_combined = concatenate_features(tok_matrix=Xrand_train_tok, data_df=Xrand_train)
Xrand_val_combined = concatenate_features(tok_matrix=Xrand_val_tok, data_df=Xrand_val)
Xrand_test_combined = concatenate_features(tok_matrix=Xrand_test_tok, data_df=Xrand_test)

# Temporal split: token counts + followers/following/retweet.
Xtemp_train_combined = concatenate_features(tok_matrix=Xtemp_train_tok, data_df=Xtemp_train)
Xtemp_val_combined = concatenate_features(tok_matrix=Xtemp_val_tok, data_df=Xtemp_val)
Xtemp_test_combined = concatenate_features(tok_matrix=Xtemp_test_tok, data_df=Xtemp_test)

Train the model(s)

Using only text

# random split: fit on token counts only, then report training accuracy
svm_model = svm.SVC()
svm_model.fit(Xrand_train_tok, yrand_train['troll'])
svm_model.score(X=Xrand_train_tok, y=yrand_train['troll'])
0.8563023725296331
# Validation accuracy, random split, text-only model.
svm_model.score(X=Xrand_val_tok, y=yrand_val['troll'])
0.8545563909774436
# Test accuracy, random split, text-only model.
svm_model.score(X=Xrand_test_tok, y=yrand_test['troll'])
0.8563652220985835
# temporal split: fit on token counts only, then report validation accuracy
svm_temp = svm.SVC()
svm_temp.fit(Xtemp_train_tok, ytemp_train['troll'])
svm_temp.score(X=Xtemp_val_tok, y=ytemp_val['troll'])
0.8595187969924812
# Test accuracy, temporal split, text-only model.
svm_temp.score(X=Xtemp_test_tok, y=ytemp_test['troll'])
0.8649664671739196

Using all features

# random split: fit on tokens + account features, then report training accuracy
svm_rand_all = svm.SVC()
svm_rand_all.fit(Xrand_train_combined, yrand_train['troll'])
svm_rand_all.score(X=Xrand_train_combined, y=yrand_train['troll'])
0.9306173238647685
# Test accuracy, random split, all-features model.
svm_rand_all.score(X=Xrand_test_combined, y=yrand_test['troll'])
0.928693873868455
# temporal split: fit on tokens + account features
svm_temp_all = svm.SVC().fit(Xtemp_train_combined, ytemp_train['troll'])
# Print the training accuracy explicitly: as a bare expression that is not
# the last line of the cell, its value was computed and then never shown.
print('train accuracy: {}'.format(svm_temp_all.score(Xtemp_train_combined, ytemp_train['troll'])))
svm_temp_all.score(Xtemp_test_combined, ytemp_test['troll'])
0.8745902378875823