Twitter Troll Detection
| EDA
| Cleaning
| Sentence Embeddings
| Naive Bayes
| Logistic Regression
| SVM
| TwitterNet
SVM
Contents
import pandas as pd
import numpy as np
from sklearn import feature_extraction
from sklearn import svm
from sklearn import preprocessing
from scipy import sparse
Load Data and Train/Test Split(s)
# Load the merged troll/non-troll tweet dataset produced by the cleaning step.
df = pd.read_json("../data/merged_troll_data.json")
content
followers
following
retweet
account_category
created_at
troll
orig_index
307969
@AIIAmericanGirI @CommonSense1212 @realDonaldT...
2267
219
0
NonTroll
2016-11-03 14:35:17
False
31589
135312
'\| ̄ ̄ ̄ ̄ ̄ ̄\| \| 33 \...
629
364
1
LeftTroll
2016-10-06 13:33:00
True
331179
82234
'@WOOKIE318 I hope you also pissed off when Cl...
12321
9091
0
RightTroll
2016-09-08 01:39:00
True
252179
300364
RT @HillaryClinton: "Everything I’ve done star...
3228
3703
1
NonTroll
2016-11-04 03:02:13
False
5521
37315
"Understanding will never bring you peace. Tha...
542
686
1
LeftTroll
2016-08-07 13:32:00
True
303267
# Load the precomputed train/val/test row indices for the random and
# temporal splits (kept in a file so every model notebook uses the same splits).
ids = pd.read_json("../data/train_test_inds.json")
Prepare feature matrix
Isolate matrices
def getxy(ids, feature_cols=('content', 'followers', 'following', 'retweet'),
          label_col=('troll',), data=None):
    """Select rows by positional index and split them into features and labels.

    Parameters
    ----------
    ids : sequence of int
        Positional (``iloc``) row indices to select.
    feature_cols : sequence of str, optional
        Columns returned as the feature frame.
    label_col : sequence of str, optional
        Columns returned as the label frame.
    data : pandas.DataFrame, optional
        Source frame. Defaults to the module-level ``df`` loaded above, so
        existing call sites are unchanged; passing it explicitly removes the
        hidden global dependency.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X, y)`` with rows in the order given by ``ids``.
    """
    # Tuple defaults avoid the shared-mutable-default pitfall; convert to
    # list so pandas treats them as column lists, not a single (tuple) key.
    source = df if data is None else data
    return source[list(feature_cols)].iloc[ids], source[list(label_col)].iloc[ids]
# Build (X, y) pairs for every partition of the two splits.
rand_ids, temp_ids = ids.random, ids.temporal
# random split
Xrand_train, yrand_train = getxy(rand_ids.train)
Xrand_val, yrand_val = getxy(rand_ids.val)
Xrand_test, yrand_test = getxy(rand_ids.test)
# temporal split
Xtemp_train, ytemp_train = getxy(temp_ids.train)
Xtemp_val, ytemp_val = getxy(temp_ids.val)
Xtemp_test, ytemp_test = getxy(temp_ids.test)
content
followers
following
retweet
204024
RT @businessinsider: OBAMA: The press doesn’t ...
14525
3311
1
45854
Review: Generation Startup https://t.co/lej8O8...
3086
2387
1
199686
RT @Kidrambler: @TomiLahren Vote for Gary John...
1117
3742
1
115712
in interpersonal relations with pple who are m...
936
582
1
245728
RT @PeterTownsend7: The Real #WarOnWomen #isi...
2891
1615
1
# Sanity-check that feature and label row counts agree for the random split.
Xrand_train . shape , yrand_train . shape
((266003, 4), (266003, 1))
Tokenize content
vocab_size = 5000
# NOTE(review): the original fit a single CountVectorizer on df['content'],
# i.e. on train + val + test combined, leaking held-out vocabulary into the
# features. Fit one vectorizer per split, on that split's training text only.
rand_tokenizer = feature_extraction.text.CountVectorizer(
    stop_words='english', max_features=vocab_size).fit(Xrand_train['content'])
temp_tokenizer = feature_extraction.text.CountVectorizer(
    stop_words='english', max_features=vocab_size).fit(Xtemp_train['content'])
# Transform every partition with its split's training-fit vocabulary.
Xrand_train_tok = rand_tokenizer.transform(Xrand_train['content'])
Xrand_val_tok = rand_tokenizer.transform(Xrand_val['content'])
Xrand_test_tok = rand_tokenizer.transform(Xrand_test['content'])
Xtemp_train_tok = temp_tokenizer.transform(Xtemp_train['content'])
Xtemp_val_tok = temp_tokenizer.transform(Xtemp_val['content'])
Xtemp_test_tok = temp_tokenizer.transform(Xtemp_test['content'])
Xrand_train_tok.shape  # token matrix dim = n x vocab_size
Standardize followers/following
# Fit one StandardScaler per split, on that split's training data only.
rand_scaler = preprocessing.StandardScaler().fit(Xrand_train[['followers', 'following']])
temp_scaler = preprocessing.StandardScaler().fit(Xtemp_train[['followers', 'following']])
# BUG FIX: the temporal line previously printed rand_scaler.scale_, which made
# the two splits' scales appear identical; report each scaler's own statistics.
print('rand means and scales: {}, {}'.format(rand_scaler.mean_, rand_scaler.scale_))
print('temp means and scales: {}, {}'.format(temp_scaler.mean_, temp_scaler.scale_))
rand means and scales: [8154.90645218 3016.03233422], [219679.05451009 7816.52064337]
temp means and scales: [8757.68069533 3020.22409146], [219679.05451009 7816.52064337]
They are very close. A single scaler would probably suffice, but I will use both anyway, in case it makes a difference.
# Standardize the follower/following counts in place, using the scaler that
# was fit on each split's own training partition.
col_to_std = ['followers', 'following']
for frame in (Xrand_train, Xrand_val, Xrand_test):
    frame[col_to_std] = rand_scaler.transform(frame[col_to_std])
for frame in (Xtemp_train, Xtemp_val, Xtemp_test):
    frame[col_to_std] = temp_scaler.transform(frame[col_to_std])
Xrand_train[col_to_std].head()
followers
following
204024
0.028997
0.037736
45854
-0.023074
-0.080475
199686
-0.032037
0.092876
115712
-0.032861
-0.311396
245728
-0.023962
-0.179240
Binarize the boolean outcome
troll
204024
False
45854
True
199686
False
115712
True
245728
False
def bool_to_bin(x):
    """Return 1 for a truthy value, 0 for a falsy one.

    A plain ``def`` rather than a lambda assignment (PEP 8 E731); the name
    and behavior are unchanged for the `.apply(bool_to_bin)` call sites below.
    """
    return 1 if x else 0
# Convert the boolean troll labels of the random training set to 0/1 ints.
yrand_train['troll'] = yrand_train['troll'].apply(bool_to_bin)
yrand_train.head()
troll
204024
0
45854
1
199686
0
115712
1
245728
0
# Apply the same boolean -> 0/1 conversion to the remaining label frames.
for labels in (yrand_val, yrand_test, ytemp_train, ytemp_val, ytemp_test):
    labels['troll'] = labels['troll'].apply(bool_to_bin)
Concatenate features
def concatenate_features(tok_matrix, data_df):
    """Concatenate the tokenized matrix with the numeric tweet features.

    Parameters
    ----------
    tok_matrix : scipy.sparse matrix, shape (n, vocab_size)
        Token-count matrix from the CountVectorizer.
    data_df : pandas.DataFrame
        Must contain 'followers', 'following' and 'retweet' columns whose
        rows align with ``tok_matrix``.

    Returns
    -------
    scipy.sparse.csr_matrix, shape (n, vocab_size + 3)
    """
    sparse_cols = sparse.csr_matrix(data_df[['followers', 'following', 'retweet']])
    # hstack defaults to COO, which supports no row indexing; request CSR to
    # match the csr_matrix used above and the format sklearn converts to anyway.
    return sparse.hstack([tok_matrix, sparse_cols], format='csr')
# Build the combined (token + numeric) design matrix for every partition.
(Xrand_train_combined, Xrand_val_combined, Xrand_test_combined,
 Xtemp_train_combined, Xtemp_val_combined, Xtemp_test_combined) = (
    concatenate_features(tok, feats)
    for tok, feats in (
        (Xrand_train_tok, Xrand_train),
        (Xrand_val_tok, Xrand_val),
        (Xrand_test_tok, Xrand_test),
        (Xtemp_train_tok, Xtemp_train),
        (Xtemp_val_tok, Xtemp_val),
        (Xtemp_test_tok, Xtemp_test),
    )
)
Train the model(s)
Using only text
# --- text features only: random split ---
svm_model = svm.SVC()
svm_model.fit(Xrand_train_tok, yrand_train['troll'])
svm_model.score(Xrand_train_tok, yrand_train['troll'])
svm_model.score(Xrand_val_tok, yrand_val['troll'])
svm_model.score(Xrand_test_tok, yrand_test['troll'])
# --- text features only: temporal split ---
svm_temp = svm.SVC()
svm_temp.fit(Xtemp_train_tok, ytemp_train['troll'])
svm_temp.score(Xtemp_val_tok, ytemp_val['troll'])
svm_temp.score(Xtemp_test_tok, ytemp_test['troll'])
Using all features
# --- all features (tokens + followers/following/retweet): random split ---
svm_rand_all = svm.SVC()
svm_rand_all.fit(Xrand_train_combined, yrand_train['troll'])
svm_rand_all.score(Xrand_train_combined, yrand_train['troll'])
svm_rand_all.score(Xrand_test_combined, yrand_test['troll'])
# --- all features: temporal split ---
svm_temp_all = svm.SVC()
svm_temp_all.fit(Xtemp_train_combined, ytemp_train['troll'])
svm_temp_all.score(Xtemp_train_combined, ytemp_train['troll'])
svm_temp_all.score(Xtemp_test_combined, ytemp_test['troll'])