"""
=======================================================================================
Keywords extraction for topics determined using 
	- Non-negative Matrix Factorization and 
	- Latent Dirichlet Allocation
=======================================================================================

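Applies Non-negative Matrix Factorization (on tf-idf features)
and Latent Dirichlet Allocation (on raw term counts) to a corpus of documents.

Output: List of topics, each topic as a set of keywords (overlapping allowed);
the weight of each keyword is printed as well.

Input parameters: n_topics (number of topics) and n_top_words (number of top words for each topic),
both set as module-level constants; the corpus directory is given as the first command-line argument.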

# Acknowledgment: Olivier Grisel <olivier.grisel@ensta.org>

"""

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import numpy as np
import sys

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import load_files
#from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy import sparse
from sklearn.preprocessing import normalize
from random import randint
from sklearn.metrics import pairwise_distances

n_topics = 3
n_top_words = 100


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` three lines are printed: a
    ``Topic #<index>:`` header, the feature names with the largest weights
    (in descending order of weight), and the matching weights in the same
    order.  A single blank line is printed after the last topic.

    Parameters:
        model: object exposing ``components_``; each row must support
            ``argsort()`` and integer indexing (one weight per feature).
        feature_names: sequence mapping a feature index to its word.
        n_top_words: maximum number of words to show per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Sort once and reuse the order for both the words and the weights;
        # the reversed slice keeps the n_top_words largest entries.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in top_indices]))
        print(" ".join([str(topic[i]) for i in top_indices]))
    print()


# Function that processes the input data directory. 
def ProcessInputDataDirectory(input_dir, swl):
    """Load a directory of text documents and vectorize it two ways.

    The directory is read with ``sklearn.datasets.load_files`` (no
    shuffling, strict UTF-8 decoding, so undecodable files raise).  The
    documents are then turned into a tf-idf matrix and a raw term-count
    matrix; both vectorizers use the built-in English stop-word list,
    ``max_df=0.95`` and ``min_df=2``.

    Parameters:
        input_dir: path of the directory handed to ``load_files``.
        swl: accepted for interface compatibility but not used by this
            function.

    Returns:
        A tuple ``(tfidf, tfidf_vocab, tf, tf_vocab)``: the tf-idf
        document-term matrix, its vocabulary as a list, the term-count
        document-term matrix, and its vocabulary as a list.
    """
    def vocabulary_of(vectorizer):
        # Newer scikit-learn releases dropped get_feature_names() in favour
        # of get_feature_names_out(); use whichever the installed version
        # provides and always hand back a plain list.
        if hasattr(vectorizer, "get_feature_names_out"):
            return list(vectorizer.get_feature_names_out())
        return vectorizer.get_feature_names()

    print("Loading dataset %s ..." % (input_dir))

    t0 = time()
    # Use the utf-8 codec and raise errors, if any, during decoding.
    dataset = load_files(input_dir, shuffle=False, encoding='utf-8',
                         decode_error='strict')
    print("Loading done in %0.3fs." % (time() - t0))

    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       stop_words='english')
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    stop_words='english')

    tfidf = tfidf_vectorizer.fit_transform(dataset.data)
    tfidf_vocab = vocabulary_of(tfidf_vectorizer)

    tf = tf_vectorizer.fit_transform(dataset.data)
    tf_vocab = vocabulary_of(tf_vectorizer)

    return tfidf, tfidf_vocab, tf, tf_vocab


# The corpus directory is the only command-line argument; exit with a usage
# message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: %s <input_dir>" % sys.argv[0])
idata = sys.argv[1]
tfidf, tfidf_vocab, tf, tf_vocab = ProcessInputDataDirectory(idata, [])

# Fit the NMF model
print("Fitting the NMF model with tf-idf features")
t0 = time()
# NOTE(review): newer scikit-learn releases replace NMF's `alpha` keyword with
# `alpha_W`/`alpha_H`, which are scaled differently -- confirm the value to
# use against the installed version before renaming it.
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
print_top_words(nmf, tfidf_vocab, n_top_words)

print("Fitting LDA models with tf features") 
lda_params = dict(max_iter=20,
                  learning_method='online',
                  learning_offset=50.,
                  random_state=0)
try:
    # `n_components` is the current name of the topic-count keyword.
    lda = LatentDirichletAllocation(n_components=n_topics, **lda_params)
except TypeError:
    # Old scikit-learn releases only accept the former `n_topics` keyword.
    lda = LatentDirichletAllocation(n_topics=n_topics, **lda_params)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
print_top_words(lda, tf_vocab, n_top_words)
