# sms_spam_detector_17.py (1.53 KB)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud
def train_test_split(X, Y, test_size):
    """Split features and labels into a leading train part and a trailing test part.

    No shuffling is performed: the last ``int(test_size * X.shape[0])`` rows
    form the test set and all rows before them form the training set.

    Args:
        X: Row-sliceable feature container exposing ``shape[0]``.
        Y: Labels, sliceable along the first axis and aligned row-for-row
            with ``X``.
        test_size: Fraction of rows (between 0 and 1 inclusive) to hold out
            for testing.

    Returns:
        The tuple ``(Xtrain, Xtest, Ytrain, Ytest)``.

    Raises:
        ValueError: If ``test_size`` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(
            f"test_size must be a fraction between 0 and 1, got {test_size!r}"
        )

    n_rows = X.shape[0]
    n_test = int(test_size * n_rows)
    # Slice from an explicit boundary rather than a negative index: when
    # n_test is 0, ``X[:-0]`` is empty and ``X[-0:]`` is the whole input,
    # which would swap the roles of the two partitions.
    split_at = n_rows - n_test
    return X[:split_at], X[split_at:], Y[:split_at], Y[split_at:]


# Backward-compatible alias for the original, misspelled function name.
train_testgit_split = train_test_split
def visualize(label):
    """Display a word cloud of all messages that carry the given label.

    Reads the module-level ``df`` DataFrame (columns ``labels`` and
    ``data``), so ``df`` must already be populated when this is called.

    Args:
        label: Value of the ``labels`` column whose messages are plotted.
    """
    selected = df.loc[df['labels'] == label, 'data']
    # Lower-case every message and append a space to each, in one pass.
    corpus = ''.join(text.lower() + ' ' for text in selected)

    cloud = WordCloud(width=600, height=400)
    cloud_image = cloud.generate(corpus)

    plt.imshow(cloud_image)
    plt.axis('off')
    plt.show()
# Load the SMS data set and keep only the label and message-text columns.
df = pd.read_csv('./files/sms_spam.csv', encoding='ISO-8859-1')
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['labels', 'data']

# Binary target: ham -> 0, spam -> 1.
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

# Bag-of-words features built from the raw message text.
count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])

# Hold out the trailing 33% of rows for testing, using this file's own
# (unshuffled) splitter.
Xtrain, Xtest, Ytrain, Ytest = train_testgit_split(X, Y, test_size=0.33)

model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print('Train score is', model.score(Xtrain, Ytrain))
print('Test score is', model.score(Xtest, Ytest))

visualize('spam')
visualize('ham')

# Predict on the full data set to inspect individual misclassifications.
df['predictions'] = model.predict(X)

# Spam that the model labelled as ham.
sneaky_spam = df[(df['b_labels'] == 1) & (df['predictions'] == 0)]['data']
for msg in sneaky_spam:
    print(msg)

print('\n\n')

# Ham that the model labelled as spam. Each comparison needs its own
# parentheses because `&` binds more tightly than `==`.
not_actually_spam = df[(df['b_labels'] == 0) & (df['predictions'] == 1)]['data']
for msg in not_actually_spam:
    print(msg)