"""
=======================================================================================
Try out the following supervised classification techniques on onion sites (documents):
1. Bernoulli Naive Bayesian Classifier
2. Multinomial Naive Bayesian Classifier
3. SVM
4. Linear Classifier with SGD
5. k-Nearest Neighbor classifier
6. C4.5 Decision Tree
7. Random Forest Classifier
8. Ridge Classifier
9. Nearest Centroid classifier

How to construct input feature vectors?
-> We support the following:
0. Bag of Words / Counting Vector / Term frequency technique 
1. TFIDF model (where each term frequency is also weighted with inverse document frequency)
2. Bag of Words / Counting Vector / Term frequency technique  [WITH STOP WORDS FILTERING]
3. TFIDF model (where each term frequency is also weighted with inverse document frequency) [WITH STOP WORDS FILTERING]

Then compare them with ATOL-TFICF accuracy statistics.
=======================================================================================

"""

"""
	Usage: python [scriptname].py [stopwords file] [Input path to data folder] [input vector construction option] [optional: ngram lower limit] [optional: ngram upper limit]
	
	The data folder should contain subfolders, each of which is named after one of the categories/classes.
	Each subfolder contains the documents/onion sites that have been manually classified as the said category.

	[input vector construction option] can be 0,1,2,3 (each representing the corresponding strategies, as discussed above)

	ngram lower and upper limits are optional; 
	but if one of them is provided the other must be provided,
	otherwise we completely ignore the ngram model.
	If none is provided we follow simple bag-of-words model, where position of the words does not matter.

	Example: python supervised_benchmark.py stopwords.txt ../input/input-data 0 > result.bow

DATE: 2.13.2017
 - Adding precision, recall, F1 computation
"""

#from __future__ import print_function
from time import time

#import os
import sys
import numpy as np
#import pylab as pl

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.utils.extmath import density
from scipy import sparse



# Number of categories. The train/test split later in this script treats
# target indices below this value as test documents and the remaining ones
# as training documents.
num_classes = 3

# Fail early with a readable message (instead of an IndexError traceback)
# when the three mandatory command line arguments are missing.
if len(sys.argv) < 4:
    sys.exit("Usage: python %s [stopwords file] [Input path to data folder] "
             "[input vector construction option] "
             "[optional: ngram lower limit] [optional: ngram upper limit]"
             % sys.argv[0])

# Read the stopwords file and build the stopwords list: one stopword per
# line, lines starting with '#' are comments.
sw_file = sys.argv[1]
swl = []
with open(sw_file, 'r') as f:  # 'with' closes the file even if reading fails
    for line in f:
        if line.startswith('#'):
            continue
        # Strip '\r' as well so files with Windows line endings do not
        # produce stopwords that can never match a token.
        stopword = line.rstrip('\r\n')
        if stopword:  # a blank line is not a stopword
            swl.append(stopword)
print("Stopwords list created...")

# Load the manually classified documents from the data folder given as the
# second command line argument and report how long the loading took.
print("Loading dataset...")
the_data_folder = sys.argv[2]
print("%s" % the_data_folder)

load_started = time()
dataset = load_files(the_data_folder, shuffle=False)
data_samples = dataset.data
print("done in %0.3fs." % (time() - load_started))
print("n_samples: %d" % len(data_samples))

# Build the input feature vectors according to the strategy selected by the
# third command line argument.
ip_vector = sys.argv[3]

# option -> (vectorizer class, filter stop words?, label for the timing line)
VECTORIZER_OPTIONS = {
    '0': (CountVectorizer, False, "BOW without stop words"),
    '1': (TfidfVectorizer, False, "TFIDF without stop words"),
    '2': (CountVectorizer, True, "BOW with stop words"),
    '3': (TfidfVectorizer, True, "TFIDF with stop words"),
}

if ip_vector not in VECTORIZER_OPTIONS:
    # Message goes to stderr and the exit status is non-zero, so a run whose
    # stdout is redirected to a result file is not mistaken for a success.
    sys.exit("Invalid input vector creation option! See Usage & Try Again!")

vectorizer_class, use_stopwords, vector_label = VECTORIZER_OPTIONS[ip_vector]

# 'decode_error' is a constructor argument of the vectorizers; it must not be
# handed to fit_transform(), which only takes the documents (and optional y).
vectorizer_args = {'max_df': 1.00, 'min_df': 0, 'decode_error': 'ignore'}
if use_stopwords:
    # Use the stopwords list read from file, not the library's built-in one.
    vectorizer_args['stop_words'] = swl
if len(sys.argv) >= 6:
    # n-gram model: only enabled when BOTH limits are given; otherwise the
    # plain bag-of-words (monogram) model is used.
    vectorizer_args['ngram_range'] = (int(sys.argv[4]), int(sys.argv[5]))

t0 = time()
vectorizer = vectorizer_class(**vectorizer_args)
transfrmed_input = vectorizer.fit_transform(data_samples)
print("%s done in %0.3fs." % (vector_label, time() - t0))


# split the dataset in training and test set:
#X_train, X_test, y_train, y_test = train_test_split(transfrmed_input, dataset.target, test_size=0.25, random_state=42) #random_state=None)

#Use the splitting used in TFICF
#
# Partition the vectorized documents by target index: documents whose target
# is below num_classes form the test set, all others form the training set
# (their labels are shifted down by num_classes so both sets share labels).
t0 = time()

pts, dims = transfrmed_input.shape

targets = np.asarray(dataset.target)
is_test = targets < num_classes

test_pts = int(np.count_nonzero(is_test))
train_pts = pts - test_pts

train_rows = np.flatnonzero(~is_test)
test_rows = np.flatnonzero(is_test)

# Select the rows directly from the sparse matrix. This avoids allocating
# dense (rows x dims) arrays of zeros and copying one row at a time, both of
# which scale with the full vocabulary size. The cast keeps the float64 dtype
# the matrices had when they were built from np.zeros().
sparse_rows = transfrmed_input.tocsr()
X_train = sparse_rows[train_rows].astype(np.float64)
X_test = sparse_rows[test_rows].astype(np.float64)

y_train = (targets[train_rows] - num_classes).tolist()  # delete the label offset
y_test = targets[test_rows].tolist()

print("test train split done in %0.3fs." % (time() - t0))

#print("** Total number of features to work with %s.\n\n" %(dims))

# A quick sanity check
# ====================
#trD=0
#trH=0
#trW=0
#
#teD=0
#teH=0
#teW=0
#
#for i in range(0, train_pts):
#	if y_train[i] == 0:
#		trD += 1
#	elif y_train[i] == 1:
#		trH += 1
#	else:
#		trW += 1
#
#for i in range(0, test_pts):
#	if y_test[i] == 0:
#		teD += 1
#	elif y_test[i] == 1:
#		teH += 1
#	else:
#		teW += 1
#
#print("=== Quick Sanity Test ===")
#print("* Drug labeled training pts: %s" % (trD))
#print("* Hacker labeled training pts: %s" % (trH))
#print("* Weapon labeled training pts: %s" % (trW))
#
#print("* Drug labeled test pts: %s" % (teD))
#print("* Hacker labeled test pts: %s" % (teH))
#print("* Weapon labeled test pts: %s" % (teW))
#print("=== *** ===")


###############################################################################
# Benchmark classifiers
###############################################################################
def benchmark(clf_class, params, name):
    """Fit one classifier on the training split and report its test results.

    Uses the module-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for evaluation. Prints the parameters, the fit and
    predict timings, the accuracy, the classification report, the confusion
    matrix and the weighted precision/recall/F1 to stdout.

    Args:
        clf_class: estimator class; it is instantiated as
            ``clf_class(**params)`` and must provide ``fit`` and ``predict``.
        params: dict of keyword arguments for the estimator constructor.
        name: human-readable classifier name. Accepted for the callers'
            benefit; it is not used in the printed output.

    Returns:
        None.
    """
    # Format explicitly: with several arguments (or none) a parenthesized
    # print shows a tuple when print is a statement, as it is in this file
    # (the print_function import is commented out).
    print("parameters: %s" % (params,))
    fit_started = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("Model built in %fs" % (time() - fit_started))

    predict_started = time()
    pred = clf.predict(X_test)
    print("Testing done in %fs" % (time() - predict_started))

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # dataset.target_names covers every category folder of the data set, the
    # training ones included, while only the labels seen in this evaluation
    # are reported. Pass exactly those labels together with their names so
    # the two always have the same length and line up.
    labels = [int(label) for label in np.union1d(y_test, pred)]
    label_names = [dataset.target_names[label] for label in labels]

    print("Classification report on test set for classifier:")
    print(clf)
    print("")
    print(classification_report(y_test, pred, labels=labels,
                                target_names=label_names))

    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)

    pr = precision_score(y_test, pred, average="weighted")
    re = recall_score(y_test, pred, average="weighted")
    f1 = f1_score(y_test, pred, average="weighted")
    print("Precision: %0.3f, Recall: %0.3f, F1: %0.3f " % (pr, re, f1))

# Benchmark runs, executed in the order listed. Each entry is
# (banner line, classifier class, constructor parameters, classifier name).
# NOTE(review): 'n_iter' is the iteration-count keyword of older scikit-learn
# releases; newer releases call it 'max_iter' - confirm against the installed
# version before upgrading.
benchmark_runs = [
    ("<1> Testbenching a linear classifier with SGD...",
     SGDClassifier,
     {
         'loss': 'hinge',
         'penalty': 'l2',
         'n_iter': 50,
         'alpha': 0.00001,
         'fit_intercept': True,
     },
     'SGD'),
    ("<2> Testbenching a MultinomialNB classifier...",
     MultinomialNB,
     {'alpha': 0.01},
     'MultinomialNB'),
    ("<3> Testbenching a BernoulliNB classifier...",
     BernoulliNB,
     {'alpha': 0.01},
     'BernoulliNB'),
    ("<4> Testbenching a Ridge classifier...",
     RidgeClassifier,
     {
         'tol': 1e-2,
         'solver': 'lsqr',
     },
     'Ridge Classifier'),
    ("<5> Testbenching a kNN classifier...",
     KNeighborsClassifier,
     {
         'n_neighbors': 10,
     },
     'k-Nearest Neighbor Classifier'),
    # The SVM run is active. (A stray statement that built an L1-penalized
    # LinearSVC and threw it away used to sit here; it had no effect.)
    ("<6> Testbenching a SVM classifier...",
     LinearSVC,
     {
         'tol': 1e-3,
     },
     'Linear SVM'),
    ("<7> Testbenching a Nearest Centroid/Rochio classifier (kind of clustering)...",
     NearestCentroid,
     {},
     'Nearest Centroid'),
    # 'n_estimators' is deliberately not set for the random forest.
    ("<8> Testbenching a RandomForest classifier...",
     RandomForestClassifier,
     {
         'max_features': None,
     },
     'Random Forest'),
    ("<9> Testbenching a DecisionTree classifier...",
     DecisionTreeClassifier,
     {},
     'Decision Tree'),
]

for banner, clf_class, parameters, clf_name in benchmark_runs:
    print("\n\n*** ================================= *** \n")
    print(banner)
    benchmark(clf_class, parameters, clf_name)
    print("*** ================================= *** \n")

