"""
=======================================================================================
Keywords extraction for topics determined using 
	- Non-negative Matrix Factorization and 
	- Latent Dirichlet Allocation
=======================================================================================

Applies Non-negative Matrix Factorization (NMF)
and Latent Dirichlet Allocation (LDA) to a corpus of documents.

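Output: a list of topics, each topic given as a set of keywords (the same
keyword may appear in several topics; weights are not shown).

Input parameters: n_topics (number of topics) and n_top_words (number of top words for each topic).
The corpus directory is read from the first command-line argument.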

# Acknowledgment: Olivier Grisel <olivier.grisel@ensta.org>

"""

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import numpy as np
import sys

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import load_files
#from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy import sparse
from sklearn.preprocessing import normalize
from random import randint
from sklearn.metrics import pairwise_distances

# Number of topics extracted by each model (NMF and LDA).
n_topics = 3
# Number of highest-weighted words printed for every topic.
n_top_words = 100


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic #<index>:`` header is
    printed, followed by one line holding the names of the ``n_top_words``
    largest-weight features in decreasing weight order, separated by single
    spaces.  One blank line is printed after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D arrays
            that support ``argsort()`` (one array of weights per topic).
        feature_names: Sequence mapping a column index to its feature name.
        n_top_words: Number of feature names to print for each topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() sorts ascending, so step backwards over the last
        # n_top_words indices to list the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))
    print()


# Function that processes the input data directory. 
def ProcessInputDataDirectory(input_dir, swl):
    """Load a directory of text documents and convert it to TF-IDF features.

    Args:
        input_dir: Directory handed to ``sklearn.datasets.load_files``.
        swl: Custom stop-word list.  When it is empty (or ``None``) the
            built-in ``'english'`` stop-word list of scikit-learn is used.

    Returns:
        A 5-tuple ``(tfidf_matrix, vocabulary, filenames, target,
        target_names)``: the matrix produced by
        ``TfidfVectorizer.fit_transform``, the list of feature names in
        column order, and the ``filenames``, ``target`` and ``target_names``
        attributes of the loaded dataset.
    """
    print("Loading dataset %s ..." % (input_dir))

    t0 = time()
    # Files are read in a fixed order (shuffle=False); bytes that cannot be
    # decoded are dropped instead of raising.
    dataset = load_files(input_dir, shuffle=False, decode_error='ignore')
    print("Loading done in %0.3fs." % (time() - t0))

    # A caller-supplied stop-word list takes precedence; otherwise fall back
    # to scikit-learn's default English list.
    if swl is not None and len(swl) > 0:
        stop_words = swl
    else:
        stop_words = 'english'

    # =========================================================================
    # TF-IDF vectorizer settings (the most relevant parameters are spelled
    # out even where they equal the library defaults).
    # See http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    # 1. max_df=0.7 -> drop terms present in more than 70% of the documents;
    #    min_df=10 -> drop terms present in fewer than 10 documents.
    # 2. analyzer='word' -> word-level tokenization.
    # 3. ngram_range=(1, 1) -> only single words are features; raise the
    #    upper bound to also consider longer n-grams.
    # 4. norm='l2' -> normalize each tf-idf vector; use_idf=True -> apply idf
    #    weighting; smooth_idf=True -> add-one smoothing of document
    #    frequencies to avoid zero divisions.
    # 5. sublinear_tf=False -> use raw tf, not log-scaled tf;
    #    lowercase=True -> lowercase all characters before tokenizing.
    # 6. decode_error='ignore' -> undecodable bytes are skipped silently.
    # =========================================================================
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.7,
        min_df=10,
        stop_words=stop_words,
        analyzer='word',
        ngram_range=(1, 1),
        norm='l2',
        lowercase=True,
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
        decode_error='ignore',
    )

    t0 = time()
    transformed_input = tfidf_vectorizer.fit_transform(dataset.data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and keep returning a plain list either way.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        tfidf_vocab = list(tfidf_vectorizer.get_feature_names_out())
    else:
        tfidf_vocab = tfidf_vectorizer.get_feature_names()
    print("No. of features extracted = %s." % (len(tfidf_vocab)))
    print("TFIDF conversion with stop words filtering done in %0.3fs. \n" % (time() - t0))

    return transformed_input, tfidf_vocab, dataset.filenames, dataset.target, dataset.target_names


# The corpus directory is taken from the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an opaque IndexError on sys.argv[1].
    sys.exit("Usage: %s <input_dir>" % sys.argv[0])

idir = sys.argv[1]
# An empty stop-word list selects the default English stop words.
tfidf, tfidf_feature_names, filenames_X, labels_X, labelnames_X = ProcessInputDataDirectory(idir, [])

# Fit the NMF model
print("Fitting the NMF model with tf-idf features")
t0 = time()
_n_docs, _n_terms = tfidf.shape
try:
    # scikit-learn >= 1.0 replaces `alpha` with alpha_W / alpha_H, and the
    # documented objective multiplies them by n_features / n_samples
    # respectively.  Dividing by those factors keeps the penalty equal to
    # the one formerly requested with alpha=.1.
    nmf = NMF(n_components=n_topics, random_state=1,
              alpha_W=.1 / _n_terms, alpha_H=.1 / _n_docs, l1_ratio=.5)
except TypeError:
    # Older scikit-learn releases only know the single `alpha` keyword.
    nmf = NMF(n_components=n_topics, random_state=1,
              alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
print_top_words(nmf, tfidf_feature_names, n_top_words)

# NOTE(review): the message below says "tf features", but the model is fitted
# on the same TF-IDF matrix as NMF, not on raw term counts -- confirm that
# this is intended.
print("Fitting LDA models with tf features")
_lda_params = dict(max_iter=5,
                   learning_method='online',
                   learning_offset=50.,
                   random_state=0)
try:
    # `n_topics` was renamed to `n_components` in scikit-learn 0.19 and the
    # old keyword was later removed.
    lda = LatentDirichletAllocation(n_components=n_topics, **_lda_params)
except TypeError:
    # scikit-learn < 0.19 only accepts the legacy `n_topics` keyword.
    lda = LatentDirichletAllocation(n_topics=n_topics, **_lda_params)
t0 = time()
lda.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
print_top_words(lda, tfidf_feature_names, n_top_words)
