#!/usr/bin/python
#
#======================================================================================================================================
# This script creates input feature vectors following ATOL's TFICF model, as explained below:
#
# * ATOL TFICF works as follows:
#
# 	- 1. For each category, calculate the weights of the keywords (this is actually the term frequency in a category multiplied by the inverse frequency across all the categories)
# 	- 2. Apply cosine similarity w.r.t the document. Here the document is represented as a Bag of Words (where each word feature also includes the number of times this term is included in the document). So, basically the number of times this word appeared * [its weight from the TFICF model]
#
# This script uses the input feature vectors generated in (2) to train additional supervised ML classifiers.
# The classifiers are taken directly from the Python scikit-learn package.
#
# * USAGE: Run it exactly with the same arguments as the 'baseline' script. 
#
#
#======================================================================================================================================

import getopt, glob, math, os, sys
from collections import defaultdict

from time import time

#import os
import sys
import numpy as np

from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy import sparse



kEpsilon = 0.0000000001  # Small number added to numerator and denominator of the ICF ratio, to prevent /0.

kMinKeywordLength = 3  # Keywords whose length is <= this value are pruned from the TFICF vectors.
kMinDocSize = 10  # Onion word files with fewer than this many lines are ignored when building M.
kMaxVecSize = 50  # Max size of the per-category keyword vector.

# Multiplier weights. kTitleMultiplier, kKeywordMultiplier and
# kRatersMultiplier shape the per-category counts accumulated in
# ProcessFilesInCategory; kPageMultiplier is applied in ProcessFilesInDir.
#
kTitleMultiplier = 10  # Amount added to a category's count for each title word.
kKeywordMultiplier = 100  # Scaling up factor for words already in the category's keyword list.
kPageMultiplier = 1  # Scaling factor for the square root of the third field of a word-file line.
kRatersMultiplier = 1.5  # Scaling up factor when the categories come from the human-labeled training file.

# Main driver function.
def main(argv):
    """Load all inputs, build the feature matrices and benchmark classifiers.

    The training matrix holds TFICF-weighted keyword counts and the test
    matrix holds raw keyword counts. Each classifier is trained and
    evaluated ``num_runs`` times, and the averaged accuracy / precision /
    recall / F1 are printed once all runs have finished.

    Args:
        argv: command-line arguments without the program name; passed on to
            ProcessArguments.

    Side effects:
        Sets the module-level globals X_train, y_train, X_test and y_test,
        which benchmark() reads.
    """
    global X_train, y_train, X_test, y_test

    (train_label_file, wordgrp_dir, keywords_file, index_file,
     test_label_file, stopwords_file, baseline_label_file,
     mode, dedup) = ProcessArguments(argv)
    (L, K, T, M, Mt, data, test, H, B) = CreateHashes(train_label_file,
                                                      wordgrp_dir,
                                                      keywords_file,
                                                      index_file,
                                                      test_label_file,
                                                      stopwords_file,
                                                      baseline_label_file)

    # Deduplicate if necessary.
    if dedup:
        print('\nDeduplicating data and test.')
        print('Data size before deduplication: ' + str(len(data.keys())))
        data = DedupData(data, T)
        print('Data size after deduplication: ' + str(len(data.keys())))
        print('Test size before deduplication: ' + str(len(test.keys())))
        test = DedupTest(test, T)
        print('Test size after deduplication: ' + str(len(test.keys())))

    # Unique categories in labeled data.
    categories = list(set([item for sublist in L.values() for item in sublist]))

    # Compute the TFICF keyword weights of every category.
    t0 = time()
    keywords = ComputeTFICF(M, Mt, K, categories)
    print("Keywords TFICF conversion done in %0.3fs." % (time() - t0))

    t0 = time()
    X_train, y_train = tficfFeatureVectors(data, L, keywords, categories)
    print("TFICF training input done in %0.3fs." % (time() - t0))

    t0 = time()
    X_test, y_test = bowFeatureVectors(data, test, keywords, categories)
    print("BoW test input done in %0.3fs." % (time() - t0))

    # RunInference(data, keywords, categories, clf)

    # (summary label, headline, classifier class, constructor kwargs, name)
    #
    # Other classifiers that were benchmarked here in the past and are
    # currently disabled: BernoulliNB(alpha=0.01),
    # RidgeClassifier(tol=1e-2, solver='lsqr'),
    # KNeighborsClassifier(n_neighbors=10), LinearSVC(tol=1e-3),
    # NearestCentroid(), RandomForestClassifier(n_estimators=50,
    # max_features=None) and DecisionTreeClassifier().
    suites = [
        ('LR',
         "<0> Testbenching logistic regression classifier ...",
         LogisticRegression,
         {'solver': 'sag', 'penalty': 'l2'},
         'Logistic Regression'),
        ('SGD',
         "<1> Testbenching a linear classifier with SGD...",
         SGDClassifier,
         {'loss': 'hinge', 'penalty': 'l2', 'n_iter': 50,
          'alpha': 0.00001, 'fit_intercept': True},
         'SGD'),
        ('NB',
         "<2> Testbenching a MultinomialNB classifier...",
         MultinomialNB,
         {'alpha': 0.01},
         'MultinomialNB'),
    ]

    num_runs = 5
    banner = "*** ================================= *** \n"
    results = dict((suite[0], []) for suite in suites)

    for _ in range(num_runs):
        for (label, headline, clf_class, parameters, name) in suites:
            print("\n\n" + banner)
            print(headline)
            results[label].append(benchmark(clf_class, parameters, name))
            print(banner)

    # Summarise only after every run has finished, and average over the
    # number of collected results rather than a hard-coded constant.
    for suite in suites:
        label = suite[0]
        runs = results[label]
        print("\n%s results over all runs:" % label)
        print(runs)
        averages = [sum(float(r[i]) for r in runs) / float(len(runs))
                    for i in range(4)]
        print("Avg. %s accuracy=%0.3f, precision=%0.3f, recall=%0.3f, f1=%0.3f)"
              % (label, averages[0], averages[1], averages[2], averages[3]))
            

# Dedup data defaultdict.
def DedupData(data, T):
    """Collapse onions that share a title down to the one with most words.

    Args:
        data: mapping onion -> {word: count}.
        T: mapping onion -> list of title words.

    Returns:
        A new defaultdict with the same nested shape as ``data``. For every
        distinct title (title words joined by a space) it holds only the
        onion with the largest word table; the first onion seen wins ties.
        The word tables are shared with ``data``, not copied. Onions that
        have no title in ``T``, and onions whose word table is empty, are
        not carried over.
    """
    # Transpose of T: title string -> onions carrying that title.
    onions_by_title = defaultdict(list)
    for (onion, title) in T.items():
        onions_by_title[' '.join(title)].append(onion)

    # For each title pick the onion with the largest word table.
    selected = set()
    for onions in onions_by_title.values():
        best_onion = None
        best_size = 0
        for onion in onions:
            # .get() instead of data[onion]: a titled onion that has no word
            # table must not be inserted into the caller's defaultdict.
            size = len(data.get(onion, ()))
            if size > best_size:
                best_size = size
                best_onion = onion
        if best_onion is not None:
            selected.add(best_onion)

    dedup_data = defaultdict(lambda: defaultdict(int))
    for (onion, val) in data.items():
        if onion in selected:
            dedup_data[onion] = val
    return dedup_data


# Dedup test hash.
def DedupTest(test, T):
    """Drop test onions whose title was already seen on another test onion.

    Args:
        test: mapping onion -> category list.
        T: mapping onion -> list of title words.

    Returns:
        A new dict with the entries of ``test`` in which, for every distinct
        title, only the first onion met while iterating ``test`` is kept.
        Onions without a title in ``T`` are always kept.
    """
    dedup_test = {}
    seen_titles = set()
    for onion in test:
        if onion in T:
            title_str = ' '.join(T[onion])
            if title_str in seen_titles:
                continue
            seen_titles.add(title_str)
        dedup_test[onion] = test[onion]
    return dedup_test


# Add 1.0 weights to the list in the hash val, for format compatibility.
def TransformHashFormat(K):
    """Return a copy of K in which every keyword is paired with weight 1.0.

    Args:
        K: mapping category -> list of keywords.

    Returns:
        A new dict mapping category -> list of (keyword, 1.0) tuples, i.e.
        the same layout as the weighted keyword vectors.
    """
    return dict(
        (category, [(keyword, 1.0) for keyword in keyword_list])
        for (category, keyword_list) in K.items()
    )


# This function builds the 9 hashes listed below and returns them as the
# tuple (L, K, T, M, Mt, data, test, H, B):
#   1) L, mapping onion -> category list. (From train_label_file)
#   2) T, mapping onion -> list of title words. (From index_file, with stop words removed)
#   3) K, mapping category -> list of keywords. (From keywords_file, limited to categories in train_label_file)
#   4) M, mapping category x keyword -> count. (From words, train_label_file, T and K).
#   5) Mt, the transpose of M.
#   6) data, mapping onion x keyword -> count. (From words)
#   7) test, mapping onion -> category list. (From test_label_file)
#   8) H, mapping onion -> category list. (From index_file)
#   9) B, mapping onion -> category list. (From baseline label file)
def CreateHashes(train_label_file, wordgrp_dir, keywords_file,
                 index_file, test_label_file, stopwords_file,
                 baseline_label_file):
    """Build every lookup table the pipeline needs from the input files.

    The builders run in a fixed order (labels, test labels, keywords, word
    counts, stop words, titles, index categories, baseline categories,
    category/keyword counts); each one prints a progress line.

    Returns:
        The tuple (L, K, T, M, Mt, data, test, H, B).
    """
    train_labels = CreateL(train_label_file)
    test_labels = CreateL(test_label_file)
    category_keywords = CreateK(keywords_file, train_labels)
    word_counts = CreateData(wordgrp_dir)
    stopwords = CreateS(stopwords_file)
    titles = CreateT(index_file, stopwords)
    index_categories = CreateH(index_file)
    baseline_categories = CreateB(baseline_label_file)
    (cat_kw_counts, kw_cat_counts) = CreateM(
        wordgrp_dir, train_labels, titles, category_keywords,
        stopwords, index_categories, test_labels)
    return (train_labels, category_keywords, titles,
            cat_kw_counts, kw_cat_counts, word_counts,
            test_labels, index_categories, baseline_categories)


# Create Hash S mapping word -> 1.
def CreateS(stopwords_file):
    """Read the stop-word file into a hash mapping word -> 1.

    Args:
        stopwords_file: path of the stop-word file, one word per line.
            A relative path is resolved against the current working
            directory; an absolute path is used as is.

    Returns:
        Dict whose keys are the lines of the file (trailing newline
        removed). Lines starting with '#' are treated as comments.
    """
    S = {}
    index = os.path.join(os.getcwd(), stopwords_file)
    print('Processing stopwords file: ' + stopwords_file)
    # The context manager closes the handle even if reading fails.
    with open(index, 'r') as f:
        for line in f:
            # Ignore comments.
            if line.startswith('#'):
                continue
            S[line.rstrip('\n')] = 1
    return S


# Create Hash L mapping onion -> category list.
def CreateL(label_file):
    """Read a label file into a hash mapping onion -> category list.

    Every non-comment line is split on ','; the first field is the onion
    and the second field is one category label. An onion that occurs on
    several lines collects all of its labels, in file order.

    Args:
        label_file: path of the label file. A relative path is resolved
            against the current working directory.

    Returns:
        Dict onion -> list of category labels. Labels of length <= 1 are
        ignored, and so are blank lines.

    Raises:
        IndexError: if a non-blank, non-comment line has no ',' separator.
    """
    L = {}
    index = os.path.join(os.getcwd(), label_file)
    print('Processing label file: ' + label_file)
    with open(index, 'r') as f:
        for line in f:
            # Ignore comments.
            if line.startswith('#'):
                continue
            stripped_line = line.rstrip('\n')
            # A blank line carries no label; skip it instead of failing on
            # the missing second field.
            if not stripped_line.strip():
                continue
            tokens = stripped_line.split(',')
            onion = tokens[0]
            cat_label = tokens[1]
            if len(cat_label) <= 1:
                # Onion listed with an empty category.
                continue
            L.setdefault(onion, []).append(cat_label)
    return L


# Create Hash T mapping onion -> list of title words (stop words and one-character words removed).
def CreateT(index_file, S):
    """Read the index file into a hash mapping onion -> list of title words.

    Every non-comment line is split on ','; the first field is the onion
    and the third field is the title, which is split on single spaces.

    Args:
        index_file: path of the index file. A relative path is resolved
            against the current working directory.
        S: stop-word hash; title words that are keys of S are dropped.

    Returns:
        Dict onion -> list of title words. Words of length <= 1 are dropped,
        and an onion appears only if at least one title word survives.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than three
            comma-separated fields.
    """
    T = {}
    index = os.path.join(os.getcwd(), index_file)
    print('Processing index file for titles: ' + index_file)
    with open(index, 'r') as f:
        for line in f:
            # Ignore comments.
            if line.startswith('#'):
                continue
            stripped_line = line.rstrip('\n')
            # Blank lines have no title field; skip them.
            if not stripped_line.strip():
                continue
            tokens = stripped_line.split(',')
            onion = tokens[0]
            title_string = tokens[2]
            if len(title_string) <= 1:
                # Onion with an empty title.
                continue
            for title_word in title_string.split(' '):
                # Ignore stop words in title.
                if title_word in S:
                    continue
                if len(title_word) > 1:
                    T.setdefault(onion, []).append(title_word)
    return T


# Create Hash H mapping onion -> category list.
def CreateH(index_file):
    """Read the index file into a hash mapping onion -> category list.

    Every non-comment line is split on ','; the first field is the onion
    and the eleventh field holds its categories separated by ';'. For each
    category only the text before the first '[' is kept as the label.

    Args:
        index_file: path of the index file. A relative path is resolved
            against the current working directory.

    Returns:
        Dict onion -> list of category labels. Labels of length <= 1 are
        ignored.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than eleven
            comma-separated fields.
    """
    H = {}
    index = os.path.join(os.getcwd(), index_file)
    print("Processing index file for original categories: " + index_file)
    with open(index, 'r') as f:
        for line in f:
            # Ignore comments.
            if line.startswith("#"):
                continue
            stripped_line = line.rstrip("\n")
            if not stripped_line.strip():
                continue
            # Split the stripped line: when the category column is the last
            # one, the raw line would leave a trailing newline glued to the
            # final label.
            tokens = stripped_line.split(",")
            onion = tokens[0]
            category_string = tokens[10]
            if len(category_string) <= 1:
                # Onion with empty categories.
                continue
            for cat in category_string.split(";"):
                cat_label = cat.split("[")[0]
                if len(cat_label) > 1:
                    H.setdefault(onion, []).append(cat_label)
    return H


# Create Hash B mapping onion -> category list.
def CreateB(baseline_label_file):
    """Read the baseline label file into a hash onion -> category list.

    Every non-comment line is split on '; '. The onion is the text of the
    first field up to its first '.', and the second field holds the
    categories separated by ';'. For each category only the text before the
    first '[' is kept as the label.

    Args:
        baseline_label_file: path of the baseline label file. A relative
            path is resolved against the current working directory.

    Returns:
        Dict onion -> list of category labels. Labels of length <= 1 are
        ignored.

    Raises:
        IndexError: if a non-blank, non-comment line has no '; ' separator.
    """
    B = {}
    baseline = os.path.join(os.getcwd(), baseline_label_file)
    print("Processing baseline label file for baseline categories: " + baseline_label_file)
    with open(baseline, 'r') as f:
        for line in f:
            # Ignore comments.
            if line.startswith("#"):
                continue
            stripped_line = line.rstrip("\n")
            if not stripped_line.strip():
                continue
            # Split the stripped line so the final label does not keep the
            # line's trailing newline.
            tokens = stripped_line.split("; ")
            onion = tokens[0].split(".")[0]
            category_string = tokens[1]
            if len(category_string) <= 1:
                # Onion with empty categories.
                continue
            for cat in category_string.split(";"):
                cat_label = cat.split("[")[0]
                if len(cat_label) > 1:
                    B.setdefault(onion, []).append(cat_label)
    return B


# Create Hash K mapping category -> list of keywords.
def CreateK(keywords_file, L):
    """Read the keywords file into a hash mapping category -> keyword list.

    Every non-comment line is split on ','; the first field is the category
    and the remaining fields are its keywords, stripped of surrounding
    whitespace. Categories that are not used as a label in ``L`` are skipped.

    Args:
        keywords_file: path of the keywords file. A relative path is
            resolved against the current working directory.
        L: mapping onion -> category list; supplies the accepted categories.

    Returns:
        Dict category -> list of keywords, in file order. A category listed
        without any keyword gets no entry.
    """
    K = {}
    keywords_path = os.path.join(os.getcwd(), keywords_file)
    print('Processing keywords file: ' + keywords_file)
    # Categories of the labeled set; built once, not once per line.
    L_cats = set(item for sublist in L.values() for item in sublist)
    with open(keywords_path, 'r') as f:
        for line in f:
            # Ignore comments.
            if line.startswith('#'):
                continue
            tokens = line.rstrip('\n').split(',')
            cat_label = tokens[0]
            # Ignore categories that do not occur in the labeled set.
            if cat_label not in L_cats:
                continue
            for kw in tokens[1:]:
                K.setdefault(cat_label, []).append(kw.strip())
    return K


# Create Hash data mapping onion x keyword -> count.
def CreateData(wordgrp_dir):
    """Return the 2d hash onion x keyword -> count for a word-group directory.

    A fresh nested defaultdict is handed to ProcessFilesInDir, which fills
    it from the files found in ``wordgrp_dir``.
    """
    empty_table = defaultdict(lambda: defaultdict(int))
    return ProcessFilesInDir(wordgrp_dir, empty_table)


# Process files in directory to create dataset hash 'data'.
def ProcessFilesInDir(directory, data):
    """Fill ``data`` (onion x word -> count) from every file in a directory.

    The onion name is the file's base name up to its first '.'. Every
    non-comment line is split on ',' and contributes
    ``int(field 2) + kPageMultiplier * sqrt(int(field 3))`` to
    ``data[onion][word]``, where word is the first field.
    NOTE(review): fields 2 and 3 look like an occurrence count and a page
    count -- confirm against the word-group file format.

    Args:
        directory: directory holding one word file per onion. A relative
            path is resolved against the current working directory.
        data: nested defaultdict to update in place.

    Returns:
        The same ``data`` object.
    """
    path = os.path.join(os.getcwd(), directory, '*')
    print('Processing files for word lookup in dir: ' + str(path))
    for filename in glob.glob(path):
        # Take the onion from the base name only: searching the whole path
        # for '.' breaks when a directory name contains a dot, and a file
        # without an extension would lose its last character.
        onion = os.path.basename(filename).split('.', 1)[0]
        with open(filename, 'r') as f:
            for line in f:
                # Ignore comments.
                if line.startswith('#'):
                    continue
                tokens = line.rstrip('\n').split(',')
                word = tokens[0]
                count = int(tokens[1]) + kPageMultiplier * math.sqrt(int(tokens[2]))
                data[onion][word] += count
    return data


# Create Hash M mapping category x keyword -> count.
def CreateM(wordgrp_dir, L, T, K, S, H, tst):
    """Return (M, Mt): category x keyword counts and their transpose.

    Two empty nested defaultdicts are created and handed to
    ProcessFilesInCategory together with the lookup tables; whatever pair it
    returns is passed back to the caller.
    """
    by_category = defaultdict(lambda: defaultdict(int))
    by_keyword = defaultdict(lambda: defaultdict(int))
    (by_category, by_keyword) = ProcessFilesInCategory(
        wordgrp_dir, L, T, K, by_category, by_keyword, S, H, tst)
    return (by_category, by_keyword)


# Process files in directory to create 2d hash M. Algorithm:
# 1. For onion O with category C:
#       1a. Add count of each word W to M[C][W].
#       1b. If W is a keyword for C, multiply count M[C][W] by kKeywordMultiplier.
# 2. For each word W in title of O with category C:
#       2a. Add kTitleMultiplier to existing count of M[C][W].
def ProcessFilesInCategory(directory, L, T, K, M, Mt, S, H, test):
    """Accumulate category x word counts (M) and their transpose (Mt).

    For every word file in ``directory``:

    * The onion name is the file's base name up to its first '.'.
    * Its categories come from ``L`` when the onion is labeled there,
      otherwise from ``H``. A labeled onion that is also in ``test`` is
      skipped entirely. Onions with no category contribute nothing.
    * Files with fewer than kMinDocSize lines are skipped, including their
      title words.
    * Step 1: for each non-stop word, the count in the second field is
      divided by the number of categories. For each category it is
      multiplied by kKeywordMultiplier if the word is one of that category's
      keywords in ``K``, and by kRatersMultiplier if the categories came
      from ``L``. The result is added to M[cat][word] and Mt[word][cat].
    * Step 2: for each title word of the onion (from ``T``),
      kTitleMultiplier is added to M[cat][word] and Mt[word][cat] for every
      category.

    The multipliers are applied to a fresh per-category copy of the base
    count, so they do not compound from one category to the next when an
    onion carries several categories.

    Args:
        directory: directory holding one word file per onion. A relative
            path is resolved against the current working directory.
        L: mapping onion -> category list (training labels).
        T: mapping onion -> list of title words.
        K: mapping category -> list of keywords.
        M: nested defaultdict category -> word -> count, updated in place.
        Mt: nested defaultdict word -> category -> count, updated in place.
        S: stop-word hash; words that are keys of S are ignored.
        H: mapping onion -> category list (from the index file).
        test: mapping onion -> category list (test labels).

    Returns:
        The tuple (M, Mt).
    """
    path = os.path.join(os.getcwd(), directory, '*')
    print('Processing files for category lookup in dir: ' + str(path))

    # Keyword lists as sets, built once, for O(1) membership tests.
    keyword_sets = dict((cat, set(kws)) for (cat, kws) in K.items())

    for filename in glob.glob(path):
        # Take the onion from the base name only: searching the whole path
        # for '.' breaks when a directory name contains a dot.
        onion = os.path.basename(filename).split('.', 1)[0]

        cat_from_raters = onion in L
        if cat_from_raters:
            if onion in test:
                # Onion is in both the training and the test set.
                continue
            cat_list = L[onion]
        else:
            # Fall back to the original categories from the index file.
            cat_list = H.get(onion, [])
        if not cat_list:
            # Without a category there is nothing to accumulate.
            continue

        with open(filename, 'r') as f:
            lines = f.readlines()
        if len(lines) < kMinDocSize:
            continue

        # Step 1.
        for line in lines:
            # Ignore comments.
            if line.startswith('#'):
                continue
            tokens = line.rstrip('\n').split(',')
            word = tokens[0]
            # Ignore stop words from kw list.
            if word in S:
                continue
            base_count = float(int(tokens[1])) / len(cat_list)
            for cat in cat_list:
                count = base_count
                if word in keyword_sets.get(cat, ()):
                    count *= kKeywordMultiplier
                if cat_from_raters:
                    count *= kRatersMultiplier
                M[cat][word] += count
                Mt[word][cat] += count

        # Step 2.
        for title_word in T.get(onion, ()):
            for cat in cat_list:
                M[cat][title_word] += kTitleMultiplier
                Mt[title_word][cat] += kTitleMultiplier

    return (M, Mt)


# This function computes the TFICF of the keywords in each category.
# For each category i, it computes the term frequency (TF) of the
# keywords in category i. It also computes the inverse class frequency
# (ICF) of each keyword j in category i. For each category i, it sorts
# the keywords using TF*ICF and outputs the resulting vector of
# category keywords with TFICF weights.
def ComputeTFICF(M, Mt, K, categories):
    """Return the top TFICF-weighted keywords of every category.

    The inverse class frequency of a keyword is
    (#categories in M + kEpsilon) / (#categories containing it + kEpsilon),
    and its weight within a category is sqrt(count * ICF).

    Args:
        M: mapping category -> keyword -> count.
        Mt: transpose of M, keyword -> category -> count.
        K: unused here.
        categories: categories to produce a keyword vector for.

    Returns:
        Dict category -> list of (keyword, weight) tuples sorted by
        descending weight, restricted to keywords longer than
        kMinKeywordLength and cut to at most kMaxVecSize entries.
    """
    # Must be read before the loop below, which looks up M[category].
    category_total = len(M.keys())
    inverse_cf = dict(
        (term, (category_total + kEpsilon) / (len(Mt[term]) + kEpsilon))
        for term in Mt.keys()
    )

    weights_by_category = {}
    for category in categories:
        term_counts = M[category]
        scored = [
            (term, math.sqrt(term_counts[term] * inverse_cf[term]))
            for term in term_counts.keys()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        long_enough = [pair for pair in scored
                       if len(pair[0]) > kMinKeywordLength]
        weights_by_category[category] = long_enough[:kMaxVecSize]
    return weights_by_category


#==================================================================================================================
#==================================================================================================================

#Function that runs inference on the entire dataset
#clf <- classifier
def RunInference(data, keywords_wt, categories, clf):
    """Classify every onion in ``data`` and print the most probable category.

    Each onion becomes a 1 x #keywords sparse row holding its raw counts of
    the keywords found in ``keywords_wt``. The row is passed to
    ``clf.predict_proba`` and the highest-probability column is reported.
    NOTE(review): column i of predict_proba is taken to mean categories[i];
    this presumably relies on the classifier having been trained on the
    category indices produced by tficfFeatureVectors -- confirm.

    Args:
        data: mapping onion -> {word: count}.
        keywords_wt: mapping category -> list of (keyword, weight).
        categories: list of category names.
        clf: fitted classifier exposing ``predict_proba``.

    Returns:
        None. One line per onion is printed and stdout is flushed after it.
    """
    # Keep this set construction as is: the column order derived from it has
    # to match the one the feature-vector builders derive the same way.
    all_kw = set([])
    for cat in categories:
        all_kw = all_kw.union(set(x for (x, w) in keywords_wt[cat]))
    kw_feat_pos = list(all_kw)
    # keyword -> column, replacing a linear list.index() per lookup.
    pos_of = dict((k, pos) for (pos, k) in enumerate(kw_feat_pos))

    dims = len(kw_feat_pos)
    for onion in data.keys():
        onion_counts = data[onion]
        feat_vect = sparse.lil_matrix((1, dims))
        for (k, pos) in pos_of.items():
            if k in onion_counts:
                feat_vect[0, pos] = onion_counts[k]  # frequency of term
        cls = clf.predict_proba(feat_vect)[0, :].tolist()
        prob = max(cls)
        cls_label = categories[cls.index(prob)]
        print("%s : %s Prob-Estimate: %0.4f" % (onion, cls_label, prob))
        sys.stdout.flush()

#==================================================================================================================
#==================================================================================================================


# Function to create TFICF input feature vectors and corresponding output labels.
def tficfFeatureVectors(bow, rdata, keywords_wt, categories):
    """Build TFICF-weighted feature rows and their class labels.

    One row is produced per (onion, label) pair of ``rdata``. Only the
    keywords of the row's own category are filled in, each with
    ``count in the onion * keyword weight``; all other columns stay 0.

    Args:
        bow: mapping onion -> {word: count}. An onion missing from it yields
            an all-zero row; ``bow`` itself is not modified.
        rdata: mapping onion -> list of category labels.
        keywords_wt: mapping category -> list of (keyword, weight).
        categories: list of category names; a label's class id is its index
            in this list.

    Returns:
        (feat_vect, y_output): a scipy LIL matrix with one row per
        (onion, label) pair and one column per distinct keyword, and the
        list of class ids in the same row order.

    Raises:
        ValueError: if a label of ``rdata`` is not in ``categories``.
    """
    # Keep this set construction identical to the one in bowFeatureVectors:
    # both functions must derive the same column order from it.
    all_kw = set([])
    for cat in categories:
        all_kw = all_kw.union(set(x for (x, w) in keywords_wt[cat]))
    kw_feat_pos = list(all_kw)
    # keyword -> column, replacing a linear list.index() in the inner loop.
    pos_of = dict((k, pos) for (pos, k) in enumerate(kw_feat_pos))

    pts = sum(len(label) for label in rdata.values())
    dims = len(kw_feat_pos)

    # Allocate the sparse matrix directly instead of via a dense array.
    feat_vect = sparse.lil_matrix((pts, dims))
    y_output = []

    i = 0
    for (onion, label) in rdata.items():
        onion_counts = bow.get(onion)
        for category in label:
            cls_label = categories.index(category)
            if onion_counts and category in keywords_wt:
                for (k, w) in keywords_wt[category]:
                    if k in onion_counts:
                        # frequency of term * weight
                        feat_vect[i, pos_of[k]] = onion_counts[k] * w
            y_output.append(cls_label)
            i += 1

    return (feat_vect, y_output)


#==================================================================================================================
#==================================================================================================================

# Function to create BoW input feature vectors and corresponding output labels.
def bowFeatureVectors(bow, rdata, keywords_wt, categories):
    """Build raw-count (bag of words) feature rows and their class labels.

    One row is produced per (onion, label) pair of ``rdata``. Every keyword
    found in ``keywords_wt`` that occurs in the onion is filled in with its
    raw count, whatever the row's category.

    Args:
        bow: mapping onion -> {word: count}. An onion missing from it yields
            an all-zero row; ``bow`` itself is not modified.
        rdata: mapping onion -> list of category labels.
        keywords_wt: mapping category -> list of (keyword, weight); only the
            keywords are used.
        categories: list of category names; a label's class id is its index
            in this list.

    Returns:
        (feat_vect, y_output): a scipy LIL matrix with one row per
        (onion, label) pair and one column per distinct keyword, and the
        list of class ids in the same row order.

    Raises:
        ValueError: if a label of ``rdata`` is not in ``categories``.
    """
    # Keep this set construction identical to the one in
    # tficfFeatureVectors: both functions must derive the same column order.
    all_kw = set([])
    for cat in categories:
        all_kw = all_kw.union(set(x for (x, w) in keywords_wt[cat]))
    kw_feat_pos = list(all_kw)
    # keyword -> column, replacing a linear list.index() in the inner loop.
    pos_of = dict((k, pos) for (pos, k) in enumerate(kw_feat_pos))

    pts = sum(len(label) for label in rdata.values())
    dims = len(kw_feat_pos)

    # Allocate the sparse matrix directly instead of via a dense array.
    feat_vect = sparse.lil_matrix((pts, dims))
    y_output = []

    i = 0
    for (onion, label) in rdata.items():
        onion_counts = bow.get(onion)
        for category in label:
            cls_label = categories.index(category)
            # Sanity check -- not a completely unseen category.
            if onion_counts and category in keywords_wt:
                for (k, pos) in pos_of.items():
                    if k in onion_counts:
                        feat_vect[i, pos] = onion_counts[k]  # frequency of term
            y_output.append(cls_label)
            i += 1

    return (feat_vect, y_output)


###############################################################################
# Benchmark classifiers
###############################################################################
def benchmark(clf_class, params, name):
    """Train one classifier on the training set and score it on the test set.

    Reads the module-level globals X_train, y_train, X_test and y_test,
    which main() sets before calling this function.

    Args:
        clf_class: classifier class; instantiated as ``clf_class(**params)``.
        params: dict of keyword arguments for the classifier constructor.
        name: human-readable classifier name used in the progress messages.

    Returns:
        Tuple (accuracy, precision, recall, f1); precision, recall and F1
        are computed with ``average="weighted"``.
    """
    print("parameters: %s" % (params,))
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("* %s Model built in %fs" % (name, time() - t0))

    t0 = time()
    pred = clf.predict(X_test)
    print("%s Testing done in %fs" % (name, time() - t0))

    acc = metrics.accuracy_score(y_test, pred)
    print("%s accuracy:   %0.3f" % (name, acc))

    print("Classification report on test set for classifier:")
    print(clf)
    print("")

    cm = confusion_matrix(y_test, pred)
    # Apply the format with %, rather than passing the name as a second
    # print argument, which left the "%s" unformatted.
    print("%s Confusion matrix:" % name)
    print(cm)

    pr = precision_score(y_test, pred, average="weighted")
    re = recall_score(y_test, pred, average="weighted")
    f1 = f1_score(y_test, pred, average="weighted")
    print("Precision: %0.3f, Recall: %0.3f, F1: %0.3f " % (pr, re, f1))

    return (acc, pr, re, f1)

#==================================================================================================================
#==================================================================================================================


# Post-process to remove words that occur in more than 1 category.
def PostProcessHash(cat_tficf):
    """Remove keywords that occur more than once across all categories.

    Args:
        cat_tficf: mapping category -> list of (keyword, weight) tuples.

    Returns:
        The same dict object, with every list replaced by a list holding
        only the keywords that occur exactly once over all lists. A keyword
        repeated inside a single list is removed as well.
    """
    word_cat_count = {}
    for lst in cat_tficf.values():
        for x in lst:
            word_cat_count[x[0]] = word_cat_count.get(x[0], 0) + 1
    # Iterate over a snapshot of the keys while the values are reassigned.
    for cat in list(cat_tficf):
        cat_tficf[cat] = [x for x in cat_tficf[cat]
                          if word_cat_count[x[0]] == 1]
    return cat_tficf


# Function that processes the input arguments.
def ProcessArguments(argv):
    """Parse the command line and return the program settings.

    Options -l/--label, -d/--dir, -k/--keywords, -i/--index, -t/--test,
    -s/--stopwords and -b/--baseline are required. -u/--unique switches
    deduplication on. -m/--mode is accepted but its value is ignored: the
    returned mode is always "accuracy" (presumably so the baseline script's
    command line can be reused -- confirm). -h/--help prints the usage and
    exits.

    Args:
        argv: list of command-line arguments without the program name.

    Returns:
        Tuple (train_label_file, wordgrp_dir, keywords_file, index_file,
        test_label_file, stopwords_file, baseline_label_file, mode, dedup).

    Raises:
        SystemExit: with status 2 on an unknown option or a missing required
            option, and with the default status after -h/--help.
    """
    # (short option, long option) of every required argument, in the order
    # of the returned tuple.
    required = (
        ('-l', '--label'),
        ('-d', '--dir'),
        ('-k', '--keywords'),
        ('-i', '--index'),
        ('-t', '--test'),
        ('-s', '--stopwords'),
        ('-b', '--baseline'),
    )
    short_of = {}
    for (short_opt, long_opt) in required:
        short_of[short_opt] = short_opt
        short_of[long_opt] = short_opt

    try:
        # Parse the argv that was passed in, not sys.argv.
        options, _ = getopt.getopt(
            argv, 'hl:d:k:i:t:s:b:m:u',
            ['help', 'label=', 'dir=', 'keywords=', 'index=', 'test=',
             'stopwords=', 'baseline=', 'mode=', 'unique'])
    except getopt.GetoptError as error:
        # Print error and usage
        print(str(error))
        PrintUsage()
        sys.exit(2)

    values = {}
    dedup = False
    for (opt, arg) in options:
        if opt in ('-h', '--help'):
            # Help message
            PrintUsage()
            sys.exit()
        elif opt in ('-m', '--mode'):
            # Accepted, but the mode is fixed below.
            pass
        elif opt in ('-u', '--unique'):
            # Dedup data
            dedup = True
        else:
            values[short_of[opt]] = arg

    # Check if arguments are given
    abort = False
    for (short_opt, _long_opt) in required:
        if short_opt not in values:
            print('Required option ' + short_opt + ' not given')
            abort = True
    if abort:
        PrintUsage()
        sys.exit(2)

    mode = "accuracy"
    return (values['-l'], values['-d'], values['-k'], values['-i'],
            values['-t'], values['-s'], values['-b'], mode, dedup)


# Function for printing the usage of the program.
def PrintUsage():
    """Print a sample command line for this script to stdout."""
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Usage:')
    print('\tSample Run: python tficf_based_classification.py -l train.dedup.labels -d WORD_GRP_UK -k KeywordGroups_UK.txt -i MASTER.Onion.Index_UK.csv -t test.dedup.labels -s stopwords.txt -b baseline_weapons_UK.txt')

# Script entry point: pass the command-line arguments (without the program
# name) to main() when the file is executed directly.
if __name__ == '__main__':
    main(sys.argv[1:])


