SVM

Load Data and Train/Test Split(s)
Prepare feature matrix
Train the model(s)
- Using only text
- Using all features

import pandas as pd
import numpy as np
from sklearn import feature_extraction
from sklearn import svm
from sklearn import preprocessing
from scipy import sparse

Load Data and Train/Test Split(s)

df=pd.read_json("../data/merged_troll_data.json")

df.shape

(332504, 8)

df.sample(5).head()

	content	followers	following	retweet	account_category	created_at	troll	orig_index
307969	@AIIAmericanGirI @CommonSense1212 @realDonaldT...	2267	219	0	NonTroll	2016-11-03 14:35:17	False	31589
135312	'\\|ï¿£ï¿£ï¿£ï¿£ï¿£ï¿£\\| \\| 33 \...	629	364	1	LeftTroll	2016-10-06 13:33:00	True	331179
82234	'@WOOKIE318 I hope you also pissed off when Cl...	12321	9091	0	RightTroll	2016-09-08 01:39:00	True	252179
300364	RT @HillaryClinton: "Everything I’ve done star...	3228	3703	1	NonTroll	2016-11-04 03:02:13	False	5521
37315	"Understanding will never bring you peace. Tha...	542	686	1	LeftTroll	2016-08-07 13:32:00	True	303267

ids=pd.read_json("../data/train_test_inds.json")

len(ids.random.train)

Prepare feature matrix

Isolate matrices

def getxy(ids, feature_cols=['content', 'followers', 'following', 'retweet'], label_col=['troll']):
    return df[feature_cols].iloc[ids], df[label_col].iloc[ids]

# random
Xrand_train, yrand_train = getxy(ids.random.train)
Xrand_val, yrand_val = getxy(ids.random.val)
Xrand_test, yrand_test = getxy(ids.random.test)

# temporal
Xtemp_train, ytemp_train = getxy(ids.temporal.train)
Xtemp_val, ytemp_val = getxy(ids.temporal.val)
Xtemp_test, ytemp_test = getxy(ids.temporal.test)

Xrand_train.head()

	content	followers	following	retweet
204024	RT @businessinsider: OBAMA: The press doesn’t ...	14525	3311	1
45854	Review: Generation Startup https://t.co/lej8O8...	3086	2387	1
199686	RT @Kidrambler: @TomiLahren Vote for Gary John...	1117	3742	1
115712	in interpersonal relations with pple who are m...	936	582	1
245728	RT @PeterTownsend7: The Real #WarOnWomen #isi...	2891	1615	1

Xrand_train.shape, yrand_train.shape

((266003, 4), (266003, 1))

Tokenize content

vocab_size=5000
tokenizer=feature_extraction.text.CountVectorizer(stop_words='english', max_features=vocab_size)
tokenizer=tokenizer.fit(df['content'])

Xrand_train_tok=tokenizer.transform(Xrand_train['content'])
Xrand_val_tok=tokenizer.transform(Xrand_val['content'])
Xrand_test_tok=tokenizer.transform(Xrand_test['content'])

Xtemp_train_tok=tokenizer.transform(Xtemp_train['content'])
Xtemp_val_tok=tokenizer.transform(Xtemp_val['content'])
Xtemp_test_tok=tokenizer.transform(Xtemp_test['content'])

Xrand_train_tok.shape # token matrix dim = n x vocab_size

(266003, 5000)

Standardize followers/following

# one for each split
rand_scaler = preprocessing.StandardScaler().fit(Xrand_train[['followers','following']])
temp_scaler = preprocessing.StandardScaler().fit(Xtemp_train[['followers','following']])

print('rand means and scales: {}, {}'.format(rand_scaler.mean_, rand_scaler.scale_))
print('temp means and scales: {}, {}'.format(temp_scaler.mean_, rand_scaler.scale_))

rand means and scales: [8154.90645218 3016.03233422], [219679.05451009   7816.52064337]
temp means and scales: [8757.68069533 3020.22409146], [219679.05451009   7816.52064337]

They are very close. Could probably just use a single one, but I will use both anyways, in case it makes a difference.

col_to_std = ['followers', 'following']
Xrand_train[col_to_std]=rand_scaler.transform(Xrand_train[col_to_std])
Xrand_val[col_to_std]=rand_scaler.transform(Xrand_val[col_to_std])
Xrand_test[col_to_std]=rand_scaler.transform(Xrand_test[col_to_std])

Xtemp_train[col_to_std]=temp_scaler.transform(Xtemp_train[col_to_std])
Xtemp_val[col_to_std]=temp_scaler.transform(Xtemp_val[col_to_std])
Xtemp_test[col_to_std]=temp_scaler.transform(Xtemp_test[col_to_std])

Xrand_train[col_to_std].head()

	followers	following
204024	0.028997	0.037736
45854	-0.023074	-0.080475
199686	-0.032037	0.092876
115712	-0.032861	-0.311396
245728	-0.023962	-0.179240

Binarize the boolean outcome

yrand_train.head()

	troll
204024	False
45854	True
199686	False
115712	True
245728	False

bool_to_bin = lambda x: 1 if x else 0
yrand_train['troll'] = yrand_train['troll'].apply(bool_to_bin)
yrand_train.head()

	troll
204024	0
45854	1
199686	0
115712	1
245728	0

yrand_val['troll'] = yrand_val['troll'].apply(bool_to_bin)
yrand_test['troll'] = yrand_test['troll'].apply(bool_to_bin)

ytemp_train['troll'] = ytemp_train['troll'].apply(bool_to_bin)
ytemp_val['troll'] = ytemp_val['troll'].apply(bool_to_bin)
ytemp_test['troll'] = ytemp_test['troll'].apply(bool_to_bin)

Concatenate features

def concatenate_features(tok_matrix, data_df):
    """ concatenate the tokenized matrix (scipy.sparse) with other features """
    sparse_cols = sparse.csr_matrix(data_df[['followers', 'following', 'retweet']])
    combined = sparse.hstack([tok_matrix, sparse_cols])
    return combined

Xrand_train_combined = concatenate_features(Xrand_train_tok, Xrand_train)
Xrand_val_combined = concatenate_features(Xrand_val_tok, Xrand_val)
Xrand_test_combined = concatenate_features(Xrand_test_tok, Xrand_test)

Xtemp_train_combined = concatenate_features(Xtemp_train_tok, Xtemp_train)
Xtemp_val_combined = concatenate_features(Xtemp_val_tok, Xtemp_val)
Xtemp_test_combined = concatenate_features(Xtemp_test_tok, Xtemp_test)

Train the model(s)

Using only text

# random split
svm_model = svm.SVC().fit(Xrand_train_tok, yrand_train['troll'])

svm_model.score(Xrand_train_tok, yrand_train['troll'])

0.8563023725296331

svm_model.score(Xrand_val_tok, yrand_val['troll'])

0.8545563909774436

svm_model.score(Xrand_test_tok, yrand_test['troll'])

0.8563652220985835

# temporal split
svm_temp = svm.SVC().fit(Xtemp_train_tok, ytemp_train['troll'])

svm_temp.score(Xtemp_val_tok, ytemp_val['troll'])

0.8595187969924812

svm_temp.score(Xtemp_test_tok, ytemp_test['troll'])

0.8649664671739196

Using all features

# random split
svm_rand_all = svm.SVC().fit(Xrand_train_combined, yrand_train['troll'])

svm_rand_all.score(Xrand_train_combined, yrand_train['troll'])

0.9306173238647685

svm_rand_all.score(Xrand_test_combined, yrand_test['troll'])

0.928693873868455

# temporal split
svm_temp_all = svm.SVC().fit(Xtemp_train_combined, ytemp_train['troll'])

svm_temp_all.score(Xtemp_train_combined, ytemp_train['troll'])

svm_temp_all.score(Xtemp_test_combined, ytemp_test['troll'])

0.8745902378875823

SVM

Contents

Load Data and Train/Test Split(s)

Prepare feature matrix

Isolate matrices

Tokenize content

Standardize followers/following

Binarize the boolean outcome

Concatenate features

Train the model(s)

Using only text

Using all features