import os
import sys

print(sys.path)
sys.path.append('c:/Users/jdk450/Python/')
import emailReadUtility

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC  # Importing SVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure you know the directory where you are and locate trec07p in the appropriate directory
print(os.getcwd())
DATA_DIR = 'c:/Users/jdk450/Python/Data/trec07p/data/'
LABELS_FILE = 'c:/Users/jdk450/Python/Data/trec07p/full/index'
TESTING_SET_RATIO = 0.2

labels = {}

# Read the labels: each line of the index file is "<label> <path>";
# ham is encoded as 1 and spam as 0
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0


def read_email_files():
    """Read every email in DATA_DIR and return the message texts with their labels."""
    X = []
    y = []
    for i in range(len(labels)):
        filename = 'inmail.' + str(i + 1)
        email_str = emailReadUtility.extract_email_text(
            os.path.join(DATA_DIR, filename))
        X.append(email_str)
        y.append(labels[filename])
    return X, y


X, y = read_email_files()

# Take a look at X and y
print(pd.DataFrame(X).head())
print(pd.DataFrame(y).head())

# Hold out 20% of the data (and its indices) for testing
X_train, X_test, y_train, y_test, idx_train, idx_test = \
    train_test_split(X, y, range(len(y)),
                     test_size=TESTING_SET_RATIO, random_state=2)

# Convert the raw email text into TF-IDF feature vectors
vectorizer = TfidfVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Initialize the SVM classifier, train, and make predictions
svm_clf = SVC(kernel='linear')  # Using SVM with a linear kernel
svm_clf.fit(X_train_vector, y_train)
y_pred = svm_clf.predict(X_test_vector)

# Compute and show the confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
print(cnf_matrix)

# Compute and print performance metrics
print('Classification accuracy {:.1%}'.format(accuracy_score(y_test, y_pred)))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Visualize the confusion matrix as a heatmap
class_names = [0, 1]  # Name of classes: 0 = spam, 1 = ham
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
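
# Optional sketch, not part of the original pipeline: SVC(kernel='linear') trains
# slowly on large collections of high-dimensional sparse TF-IDF vectors, since its
# fit time grows at least quadratically with the number of samples. If training on
# the full trec07p corpus is too slow, scikit-learn's LinearSVC is a common
# alternative for a linear-kernel SVM on sparse data. The snippet below assumes the
# X_train_vector, X_test_vector, y_train, and y_test variables defined above and
# reuses the same evaluation call; the C=1.0 setting is just the default, shown for
# illustration.
from sklearn.svm import LinearSVC

fast_svm_clf = LinearSVC(C=1.0)            # liblinear-based linear SVM
fast_svm_clf.fit(X_train_vector, y_train)
y_pred_fast = fast_svm_clf.predict(X_test_vector)
print(classification_report(y_test, y_pred_fast))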