"""
=======================================================================================
The following is an implementation of the unsupervised K-means, 
a simple variant of the MuSSCO algorithm.


Usage: Same as mussco.py 

"""

from time import time

import os, sys, getopt, codecs
sys.path.append("/home/ariyam/pkgs/")

import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import load_files
#from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy import sparse
from sklearn.preprocessing import normalize
from random import randint
from sklearn.metrics import pairwise_distances


# Function that processes the input arguments.
def ProcessArguments():
    """Parse the command-line options in sys.argv and echo the settings.

    Recognised options (every option takes a value):
        -s  stopwords file (optional)
        -k  keywords file (required)
        -l  seeded labels file (required)
        -i  input data directory (required)
        -c  number of clusters, converted with int() (required)
        -t  test file (optional)
        -d  debug switch; enabled only for the value 'ON' (case-insensitive)
        -n  noise percentage, returned exactly as given on the command line
        -m  must-link file (required)

    The process exits with status 1 when getopt rejects the command line or
    when a required option is missing.

    Returns the tuple (stopwords_file, keywords_file, labels_file, input_dir,
    num_clusters, strict_test, debug_option, noise, must_link_file).
    """
    stopwords_file = None
    keywords_file = None
    labels_file = None
    input_dir = None
    num_clusters = 0
    strict_test = None
    debug_option = False
    noise = 0
    must_link_file = None

    try:
        options, _args = getopt.getopt(sys.argv[1:], 's:k:l:i:c:t:d:n:m:')
    except getopt.GetoptError as error:
        print(str(error))
        sys.exit(1)

    # Options actually present on the command line.
    seen = set()

    for opt, arg in options:
        seen.add(opt)
        # Compare with ==: "opt in ('-s')" would be a substring test because
        # ('-s') is a plain string, not a one-element tuple.
        if opt == '-s':      # stopwords file
            stopwords_file = arg
        elif opt == '-k':    # keywords file
            keywords_file = arg
        elif opt == '-i':    # input data directory
            input_dir = arg
        elif opt == '-l':    # labels file
            labels_file = arg
        elif opt == '-c':    # number of clusters
            num_clusters = int(arg)
        elif opt == '-t':    # test file, if any
            strict_test = arg
        elif opt == '-d':    # debug option
            if arg.upper() == 'ON':
                debug_option = True
        elif opt == '-n':    # noise percentage (kept as the raw string)
            noise = arg
        elif opt == '-m':    # must-link file
            must_link_file = arg
        else:
            # Defensive only: getopt already rejects unknown options above.
            print("I don't understand one of your options. Only options allowed are: ")
            for flag in 'sklictdnm':
                print("-%s " % flag)
            print("\n Aborting... Try again!")
            sys.exit(1)

    # Check that the mandatory arguments are given.
    for required in ('-k', '-l', '-i', '-c', '-m'):
        if required not in seen:
            print('Required option %s not given' % required)
            sys.exit(1)

    print("=== Mandatory Arguments ===")
    print("* Keywords file by experts: %s" % (keywords_file))
    print("* Seeded labels file: %s" % (labels_file))
    print("* Must-Link file computed using provenance: %s" % (must_link_file))
    print("* Input data directory: %s" % (input_dir))
    print("* No. of clusters: %s \n" % (num_clusters))

    print("=== Optional Arguments ===")
    print("* Stopwords file: %s" % (stopwords_file))
    if '-t' in seen:
        print("* Test file [strict-test mode is ON]: %s" % (strict_test))
        print("* Noise Perturbation (in percent): %s" % (noise))
    else:
        print("* Test file [strict-test mode is OFF]: %s" % (strict_test))
        print("* Since testing is OFF, noise is set to: %s" % (noise))
    print("* Run Debug: %s \n" % (debug_option))

    return stopwords_file, keywords_file, labels_file, input_dir, num_clusters, strict_test, debug_option, noise, must_link_file

# Function that processes the stopwords file and creates the stopwords list.
def ProcessStopWordsFile(stopwords_file):
    """Read the stopwords file and return its entries as a list.

    stopwords_file <- path of a UTF-8 text file with one stopword per line,
                      or None when no file was supplied.

    Lines starting with '#' are treated as comments and skipped. Trailing
    line terminators ('\\n' and '\\r') are removed from every entry, so files
    with Windows line endings yield clean stopwords.

    Returns an empty list when stopwords_file is None.
    """
    swl = []
    if stopwords_file is None:
        return swl

    t0 = time()
    # The context manager closes the file even if decoding fails.
    with codecs.open(stopwords_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith('#'):
            continue
        swl.append(line.rstrip('\r\n'))

    print("Processed %s in %0.3fs." % (stopwords_file, (time() - t0)))
    print("Stopwords list created... \n")
    return swl

# Function that processes the input data directory. 
def ProcessInputDataDirectory(input_dir, swl):
    """Load the corpus under input_dir and convert it to a TF-IDF matrix.

    input_dir <- directory handed to sklearn's load_files (not shuffled,
                 decoded as UTF-8 with strict error handling)
    swl       <- stopwords list; when it is empty no stopword filtering is
                 applied (stop_words=None)

    The vectorizer is built with max_df=0.7, min_df=10, word-level unigrams,
    lowercasing, L2 normalisation, idf weighting with smoothing and raw
    (non-sublinear) term frequencies.

    Returns (tfidf matrix, vocabulary, dataset.filenames, dataset.target,
    dataset.target_names).
    """
    print("Loading dataset %s ..." % (input_dir))

    load_start = time()
    # use utf-8 encoding codec and raise errors, if any, during decoding
    dataset = load_files(input_dir, shuffle=False, encoding='utf-8', decode_error='strict')
    print("Loading done in %0.3fs." % (time() - load_start))

    # The two configurations only differ in the stopwords handed over:
    # the user supplied list when there is one, otherwise none at all.
    if len(swl) > 0:
        chosen_stop_words = swl
    else:
        chosen_stop_words = None

    # Parameter reference:
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.7,
        min_df=10,
        stop_words=chosen_stop_words,
        analyzer='word',
        ngram_range=(1, 1),
        norm='l2',
        lowercase=True,
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
        encoding='utf-8',
        decode_error='strict',
    )

    vectorize_start = time()
    transfrmed_input = tfidf_vectorizer.fit_transform(dataset.data)
    tfidf_vocab = tfidf_vectorizer.get_feature_names()
    print("No. of features extracted = %s." % (len(tfidf_vocab)))
    print("TFIDF conversion with stop words filtering done in %0.3fs. \n" % (time() - vectorize_start))

    return transfrmed_input, tfidf_vocab, dataset.filenames, dataset.target, dataset.target_names

# Function that processes the seeded labels file and creates a <onion id, label id> map.
# onion id -> i means the ith document or onion.
# label id -> j means the jth corresponding label (if the labels were arranged alphabetically)
# Caution: For now, we are ONLY considering UNILABEL seeds.
def ProcessSeededLabelsFile(seeded_labels_file, path, filenames, labelnames):
    """Read the seeded labels file and build the <onion id, label id> map.

    seeded_labels_file <- UTF-8 file with lines "onionname: label1, label2";
                          lines starting with '#' are comments
    path               <- prefix used to rebuild each file name as
                          path/label/onionname
    filenames          <- array of input file names (must support .tolist());
                          the position of a name is the onion id
    labelnames         <- sequence of label names; the position is the label id

    Lines without a ':' separator (e.g. blank lines) are skipped instead of
    raising IndexError. A pair whose rebuilt file name is not among filenames
    is ignored. Seeds are unilabel: a later label for the same onion id
    overwrites an earlier one. A label missing from labelnames maps to -1.

    Returns a dict {onion id: label id}.
    """
    seeded_labels = {}

    # Build the lookup tables once; setdefault keeps the first position of a
    # repeated name, which is what a list .index() search would return.
    file_index = {}
    for pos, name in enumerate(filenames.tolist()):
        file_index.setdefault(name, pos)
    label_index = {}
    for pos, name in enumerate(labelnames):
        label_index.setdefault(name, pos)

    print("Processing file: %s ..." % (seeded_labels_file))

    t0 = time()
    with codecs.open(seeded_labels_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith('#'):
            continue
        tokens = line.split(':')
        if len(tokens) < 2:     # blank or malformed line, nothing to seed
            continue
        onionname = tokens[0].strip()                   # get the onion name
        associated_labels_list = tokens[1].split(',')   # labels associated with the onion
        for item in associated_labels_list:
            label = item.strip()
            onionname_with_path = path + "/" + label + "/" + onionname
            indx = file_index.get(onionname_with_path, -1)
            if indx == -1:      # onion with this label is absent as an input point, so ignore it
                continue
            seeded_labels[indx] = label_index.get(label, -1)    # unilabel seeds
            print("Processed seed: %s" % (onionname_with_path))
    print("Processed seeded labels file in %0.3fs. \n" % (time() - t0))

    return seeded_labels


# Function that processes the must-link onions file and creates all onions x onions sparse adjacency matrix M
# M[i][j] = 1, indicates that there exists a must-link between onion i and onion j.
# M[i][j] = 0, otherwise.
def ProcessMustLinkFile(must_link_file, filepaths):
    """Read the must-link file and build the symmetric adjacency matrix M.

    must_link_file <- UTF-8 file with lines "onionname1:onionname2"; lines
                      starting with '#' are comments
    filepaths      <- sequence of input file paths; the last '/'-separated
                      component of each path is taken as the onion name and
                      its position as the onion id

    Lines without a ':' separator (e.g. blank lines) are skipped instead of
    raising IndexError. A pair is ignored when either name is unknown. When an
    onion name occurs under several paths, the first position is used.

    Returns a scipy lil_matrix of shape (len(filepaths), len(filepaths)) with
    M[i, j] = M[j, i] = 1 for every must-link pair and 0 elsewhere.
    """
    num_pts = len(filepaths)
    M = sparse.lil_matrix((num_pts, num_pts))

    # onion name (same as the file name) -> first position in filepaths
    name_index = {}
    for pos in range(num_pts):
        name_index.setdefault(filepaths[pos].split('/')[-1], pos)

    print("Processing file: %s ..." % (must_link_file))

    t0 = time()
    with codecs.open(must_link_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith('#'):
            continue
        tokens = line.split(':')
        if len(tokens) < 2:     # blank or malformed line, no pair to record
            continue
        onionindex1 = name_index.get(tokens[0].strip(), -1)
        onionindex2 = name_index.get(tokens[1].strip(), -1)
        if onionindex1 == -1 or onionindex2 == -1:
            continue
        M[onionindex1, onionindex2] = 1     # must-link property is commutative
        M[onionindex2, onionindex1] = 1

    print("Processed must-link file in %0.3fs. \n" % (time() - t0))

    return M


# Function that processes the must-link neighbors of a given point p.
# If must-link neighbors are present, their majority label id is returned. For majority vote calculation, we only consider pts with ids less than p.
# If must-link neighbors are absent, -1 is returned.
# p <- pt id, M <- must-link matrix, L <- labels, nc <- number of clusters 
def AnalyseMustLinkNeighbors(p, M, L, nc):
    """Return the majority label among the must-link neighbors of point p.

    p  <- point id; only points with ids smaller than p are inspected
    M  <- must-link matrix, M[p, q] == 1 marks a must-link between p and q
    L  <- labels, indexed by point id
    nc <- number of clusters

    Returns -1 when no inspected point is linked to p; on a tie the smallest
    label id wins.
    """
    votes = np.zeros(int(nc))
    linked = 0

    for q in range(p):
        if M[p, q] != 1:
            continue
        votes[int(L[q])] += 1
        linked += 1

    if linked == 0:     # no must-link neighbors under consideration
        return -1

    tally = votes.tolist()
    return tally.index(max(tally))


# Function that processes the must-link neighbors of a given point p and a given label id l.
# If must-link neighbors are present and some of them match the label l, then the number of such neighbors is returned.
# Caution: For the computation as above, we only consider pts with ids less than p.
# If must-link neighbors with label l are absent, 0 is returned.
# p <- pt id, M <- must-link matrix, L <- corresponding labels, l <- given input label, nc <- number of clusters 
def AnalyseMustLinkNeighborsForGivenLabel(p, M, L, l, nc):
    """Count the must-link neighbors of point p that carry label l.

    p  <- point id; only points with ids smaller than p are inspected
    M  <- must-link matrix, M[p, q] == 1 marks a must-link between p and q
    L  <- labels, indexed by point id
    l  <- label id to count
    nc <- number of clusters

    Returns the count as an entry of a numpy float array (0.0 when there is
    no matching neighbor).
    """
    histogram = np.zeros(int(nc))

    for q in range(p):
        if M[p, q] != 1:
            continue
        histogram[int(L[q])] += 1

    return histogram[int(l)]


# Function that processes the test file.
# All the onions mentioned in the test file are removed from the input training data.
# NOTE: If there is an overlap between test onions and seeded onions, then we ignore those onions and do not consider them for seeding.
# label id -> j means the jth corresponding label (if the labels were arranged alphabetically)
# Caution: For now, we are ONLY considering UNILABEL points. NO OVERLAPPING labels!
def ProcessTestFile(strict_test_file, path, filenames, y_labels, labelnames, tfidf_matrix):
    """Split the onions listed in the test file off the training data.

    strict_test_file <- UTF-8 file with lines "onionname: label1, label2";
                        lines starting with '#' are comments
    path             <- prefix used to rebuild each file name as
                        path/label/onionname
    filenames        <- array of input file names, one per row of tfidf_matrix
    y_labels         <- array of label ids, one per row of tfidf_matrix
    labelnames       <- sequence of label names; the position is the label id
    tfidf_matrix     <- sparse TF-IDF matrix of the whole input

    Lines without a ':' separator (e.g. blank lines) are skipped instead of
    raising IndexError. Pairs whose rebuilt file name is unknown, or which
    were already moved to the test set by an earlier line, are ignored.

    The rows are selected on the sparse matrix in one step, so the input is
    never densified. With no test onions the returned test matrix has shape
    (0, n_features).

    Returns (test rows as csr_matrix, list of test label ids, remaining
    filenames, remaining y_labels, remaining training rows as csr_matrix).
    Test rows and labels are in the order the onions appear in the file.
    """
    filenames = np.asarray(filenames)
    y_labels = np.asarray(y_labels)

    # Lookup tables over the original positions; setdefault keeps the first
    # position of a repeated name, like a list .index() search would.
    file_index = {}
    for pos, name in enumerate(filenames.tolist()):
        file_index.setdefault(name, pos)
    label_index = {}
    for pos, name in enumerate(labelnames):
        label_index.setdefault(name, pos)

    picked = []         # original row ids moved to the test set, in file order
    picked_set = set()  # same ids, for constant-time "already taken" checks
    test_labl = []

    print("Processing file: %s ..." % (strict_test_file))

    t0 = time()
    with codecs.open(strict_test_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith('#'):
            continue
        tokens = line.split(':')
        if len(tokens) < 2:     # blank or malformed line
            continue
        onionname = tokens[0].strip()                   # get the onion name
        associated_labels_list = tokens[1].split(',')   # labels associated with the onion
        for item in associated_labels_list:
            label = item.strip()
            onionname_with_path = path + "/" + label + "/" + onionname
            indx = file_index.get(onionname_with_path, -1)
            # absent as an input point, or already removed from the training data
            if indx == -1 or indx in picked_set:
                continue

            picked.append(indx)
            picked_set.add(indx)
            test_labl.append(label_index.get(label, -1))

            print("Processed test onion: %s" % (onionname_with_path))

    print("Processed test file in %0.3fs. \n" % (time() - t0))

    mat = sparse.csr_matrix(tfidf_matrix)
    test_rows = np.asarray(picked, dtype=int)
    keep = np.ones(mat.shape[0], dtype=bool)
    keep[test_rows] = False
    train_rows = np.flatnonzero(keep)

    te_recs = mat[test_rows]
    tfidf_matrix = mat[train_rows]
    return te_recs, test_labl, filenames[train_rows], y_labels[train_rows], tfidf_matrix


# Function that perturbs the seeded data with some noise.
# The function is called ONLY when we are testing and noise percent > 0.
# Also we NEVER CHANGE the actual labels of the original training data. We only ALTER labels of the seeds. 
# Otherwise it will mess up our precision, recall, and F1-measure calculation. 
# Noise perturbation = n: 
#	Means n% of the seeded data are randomly selected and assigned a random label.
# noise <- noise percent, labelnames <- vector containing names of all the different labels
# seeded_labels <- dictionary: training point ids which have been used as seeds are KEYS, corresponding labels are VALUES. 
def PerturbWithNoise(seeded_labels, labelnames, noise):
    """Randomly relabel roughly noise percent of the seeds.

    seeded_labels <- dict {training point id: label id}; modified in place
    labelnames    <- sequence with the names of all the different labels
    noise         <- noise percentage (number or numeric string)

    Each seed is perturbed independently with probability noise/100, drawn
    with a resolution of 1 percent. A perturbed seed gets a label chosen
    uniformly among the label ids different from its current one. Nothing is
    perturbed when noise is not positive or when no alternative label exists
    (e.g. a single label).

    Returns the same seeded_labels dict.
    """
    num_clusters = len(labelnames)
    noise_pct = float(noise)    # the command line hands the value over as a string

    num_perturbs = 0
    num_seeds = 0
    for i in seeded_labels:
        num_seeds += 1
        # randint(1, 100) is uniform on 1..100, so the test below succeeds
        # with probability noise/100 (always for >= 100, never for <= 0).
        if noise_pct <= 0 or randint(1, 100) > noise_pct:
            continue

        old_label = seeded_labels[i]
        candidates = [z for z in range(num_clusters) if z != old_label]
        if not candidates:      # no other label to switch to
            continue

        seeded_labels[i] = candidates[randint(0, len(candidates) - 1)]     # randomly changing the cluster label
        print("Randomly perturbed seed %s from %s to %s ..." % (str(i), labelnames[old_label], labelnames[seeded_labels[i]]))
        num_perturbs += 1

    print("Randomly perturbed %s of %s seeds ... \n" % (str(num_perturbs), str(num_seeds)))

    return seeded_labels


# Function that processes the keywords file provided by the domain experts and creates manual topics with the given word/phrase lists.
# manual_topics -> a sparse matrix of dimension (num of clusters x vocabulary length).
# If there are clusters whose manual keywords are absent in the corpus or whose manual keywords and phrases are not given by the experts, 
# their corresponding manual topics will be assigned with zero vector.
# manual_topic_indexes -> marks the indexes of the topics that have been properly initialized (not with some zero vector) as 1 or True. Rest are 0 or False.
# manual_topic_indexes -> This will be used in an optimization trick that will save lots of computational time later on. 
def ProcessKeywordsFile(keywords_file, labelnames, vocabulary, n_clusters):
    """Build the manual topics from the keywords file supplied by the experts.

    keywords_file <- UTF-8 file with lines "topicname: phrase1, phrase2";
                     lines starting with '#' are comments
    labelnames    <- sequence of cluster/topic names; the position is the
                     topic index
    vocabulary    <- sequence of TF-IDF feature names; the position is the
                     feature index
    n_clusters    <- number of clusters (rows of the result)

    Lines without a ':' separator (e.g. blank lines) are skipped instead of
    raising IndexError. Topics missing from labelnames and phrases missing
    from vocabulary are ignored. Phrases are lowercased before the lookup.

    Returns (manual_topics, manual_topic_indexes):
        manual_topics        <- lil_matrix (n_clusters x len(vocabulary)),
                                1 for every matched phrase, then L2-normalised
                                by row; topics without a match stay zero
        manual_topic_indexes <- numpy array, 1 for topics with at least one
                                matched phrase and 0 otherwise
    """
    n_feats = len(vocabulary)   # number of words or phrases generated by the TFIDF combination
    manual_topics = sparse.lil_matrix((n_clusters, n_feats))
    manual_topic_indexes = np.zeros(n_clusters)

    # Lookup tables built once; setdefault keeps the first position of a
    # repeated entry, like a list .index() search would.
    topic_lookup = {}
    for pos, name in enumerate(labelnames):
        topic_lookup.setdefault(name, pos)
    phrase_lookup = {}
    for pos, name in enumerate(vocabulary):
        phrase_lookup.setdefault(name, pos)

    print("Processing file: %s ..." % (keywords_file))

    t0 = time()
    with codecs.open(keywords_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith('#'):    # skip comment
            continue
        tokens = line.split(':')
        if len(tokens) < 2:         # blank or malformed line
            continue

        topicname = tokens[0].strip()   # get the cluster/topic name
        topic_index = topic_lookup.get(topicname, -1)
        if topic_index == -1:   # unknown cluster: the input data has no point in it
            continue

        associated_phrase_list = tokens[1].split(',')   # phrases/words associated with the topic/cluster
        for item in associated_phrase_list:
            # lowercase to stay consistent with the TF-IDF vocabulary and feature names
            phrase = item.strip().lower()
            phrase_index = phrase_lookup.get(phrase, -1)
            if phrase_index == -1:  # phrase not present in the input corpus
                continue
            manual_topics[topic_index, phrase_index] = 1    # for now, we are initializing it with '1'.
            if manual_topic_indexes[topic_index] == 0:
                manual_topic_indexes[topic_index] = 1   # topic now backed by expert keywords matching the corpus
                print("Processed manual keywords for topic: %s" % (topicname))

    print("Processed manual keywords file in %0.3fs. " % (time() - t0))
    manual_topics = normalize(manual_topics, norm='l2', axis=1, copy=False).tolil()    # l2 normalization by row for each cluster/topic
    print("* L2 normalization of cluster centers done after manual topic assignment ... \n")

    return (manual_topics, manual_topic_indexes)

# random assignment of initial topics/clusters, when expert provided manual keywords are absent or did not work.
def InitializeClusterCenters(matrix, is_assigned, all_topics):
    """Randomly fill the cluster centers that have not been assigned yet.

    matrix      <- cluster centers (topics x features), modified in place
    is_assigned <- per topic, 0 marks a topic that still needs a center
    all_topics  <- topic names, used for the log messages only

    For an unassigned topic each feature is set, with a 1 in 100 chance, to a
    random integer between 1 and 10. When at least one topic was randomised
    the matrix is L2-normalised by row and returned in lil format; otherwise
    the input matrix is returned untouched.
    """
    randomized = False

    n_topics, n_feats = matrix.shape
    for topic in range(n_topics):
        if is_assigned[topic] != 0:     # topic already has a center
            continue
        for feat in range(n_feats):
            if randint(0, 99) != 0:     # roughly 1% sparse
                continue
            matrix[topic, feat] = randint(1, 10)    # random integer between 1 and 10
        print("Randomly initialized topic: %s" % (all_topics[topic]))
        randomized = True

    if randomized:
        matrix = normalize(matrix, norm='l2', axis=1).tolil()
        print("* L2 normalization of cluster centers done due to random init ... \n")
    else:
        print("* cluster centers initialized as the manual topics ... \n")

    return matrix
	

# use means of the seeded documents to improve the cluster centers
# works irrespective of whether experts have provided domain specific keywords or not
# finally, randomly initialize cluster centers/topics, which do not have any seeded documents and also expert provided keywords
def UseSeedsToImproveClusterCenters(matrix, is_assigned, all_topics, input_corpus, seeded_labels):
    """Add the seeded documents to their cluster centers, then fill the gaps.

    matrix        <- cluster centers (topics x features), modified in place
    is_assigned   <- per topic, 0 marks a topic without a center; entries are
                     set to 1 for topics that receive a seed
    all_topics    <- topic names, used for the log messages only
    input_corpus  <- document vectors, indexed by document id
    seeded_labels <- dict {document id: topic id}

    Every seeded document vector is added to the row of its topic. Topics
    that are still unassigned afterwards get a random sparse center (each
    feature set with a 1 in 100 chance to an integer between 1 and 10). When
    seeds or random values were used, the matrix is L2-normalised by row and
    returned in lil format; otherwise the input matrix is returned untouched.
    """
    used_seeds = False
    used_random = False

    for doc_id in seeded_labels:
        topic = seeded_labels[doc_id]
        matrix[topic, :] += input_corpus[doc_id, :]
        used_seeds = True       # seeds used during initialization
        if is_assigned[topic] == 0:
            print("Initialized topic from seeds: %s" % (all_topics[topic]))
            is_assigned[topic] = 1

    n_topics, n_feats = matrix.shape
    for topic in range(n_topics):
        if is_assigned[topic] != 0:     # topic already has a center
            continue
        for feat in range(n_feats):
            if randint(0, 99) != 0:     # roughly 1% sparse
                continue
            matrix[topic, feat] = randint(1, 10)    # random integer between 1 and 10
        print("Randomly initialized topic: %s" % (all_topics[topic]))
        used_random = True      # no corresponding seed OR expert keywords given

    changed = used_random or used_seeds
    if changed:
        matrix = normalize(matrix, norm='l2', axis=1).tolil()   # normalize

    if used_random:
        print("* L2 normalization of cluster centers done due to random init ... ")
    if used_seeds:
        print("* L2 normalization of cluster centers done as seeds used during initialization ... ")
    if changed:
        print("\n")
    else:
        print("* cluster centers initialized as the manual topics ... \n")

    return matrix
	

# Function returns the index of an element occurring in the list, if the element is present.
# If the element is not present, the function returns -1.
# Because of the last criterion, we cannot use python's in-built index method for lists directly.
def FindInList(elem, theList):
    """Return the first position of elem in theList, or -1 when it is absent."""
    try:
        return theList.index(elem)
    except ValueError:
        return -1
	

# The following calculates the value of the objective function.
# X <- tfidf pts (normalized), M <- current cluster centers, T <- manual topics as generated from expert keywords
# S <- seeded labels, L <- actual labels assigned by the algorithm
# hyper-parameters: u <- weighs closeness to manual topics, v <- scores on matching seeded labels
# 		    w <- weight for incorporating must-link neighbors
# A <- must-link adjacency matrix
def ObjFuncVal(X, M, T, S, L, u, v, w, A):
    """Evaluate the objective function for the current assignment.

    X <- tfidf pts, M <- current cluster centers, T <- manual topics
    S <- seeded labels {point id: label id}, L <- labels assigned by the algorithm
    u <- weight of the closeness to the manual topics
    v <- score added for every seeded point whose assigned label matches its seed
    w <- weight of the must-link neighbors sharing the assigned label
    A <- must-link adjacency matrix

    The value is the sum over all points of (similarity to the assigned
    center + seed score + w * matching must-link neighbors), plus u times the
    sum of the element-wise product of T and M.
    """
    total = 0

    n_pts, _ = X.shape
    n_clusters, _ = M.shape
    for idx in range(n_pts):
        assigned = int(L[idx])
        # 1st component: similarity of the point to its cluster center
        total += (X[idx, :].multiply(M[assigned, :])).sum(axis=None)
        # 3rd component: the onion is seeded and the label matches
        if idx in S and assigned == int(S[idx]):
            total += v
        # 4th component: provenance
        total += w * AnalyseMustLinkNeighborsForGivenLabel(idx, A, L, assigned, n_clusters)

    # 2nd component: closeness of the centers to the manual topics
    total += u * (T.multiply(M)).sum(axis=None)

    return total
	

# The following is a debug function to understand how the objective function value changes.
# It will print the contribution of each point towards the objective function value, how it changed (increased/decreased)
# and how the overall function value changed (increased/decreased) and why?
# X <- tfidf pts (normalized), M <- current cluster centers, T <- manual topics as generated from expert keywords
# S <- seeded labels, L <- actual labels assigned by the algorithm
# hyper-parameters: u <- weighs closeness to manual topics, v <- scores on matching seeded labels
# 		    w <- weight for incorporating must-link neighbors
# A <- must-link adjacency matrix
# P <- #pts x #components matrix representing the contribution of each point towards the objective function value.
# Q <- 1D array representing the contribution of each topic towards the objective function value.
# string <- a prefix string for each iteration of running the debug...
def RunDebug(X, M, T, S, L, u, v, w, A, P, Q, string):
    """Print how every part of the objective function changed since the last call.

    X <- tfidf pts, M <- current cluster centers, T <- manual topics
    S <- seeded labels {point id: label id}, L <- labels assigned by the algorithm
    u, v, w <- hyper-parameters for manual topics, seeds and must-link neighbors
    A <- must-link adjacency matrix
    P <- (#pts x 3) array with the previous per-point contributions: column 0
         similarity, column 1 seed, column 2 provenance; updated in place
    Q <- 1D array with the previous per-topic contributions; updated in place
    string <- prefix printed in front of every line

    The topic component is evaluated per topic (row i of T against row i of
    M), so the per-topic deltas add up to the change of the topic term of the
    objective function.

    Returns (P, Q) holding the current contributions.
    """
    simi_net_delta = 0
    seed_net_delta = 0
    topic_net_delta = 0
    provenance_net_delta = 0

    pts, feats = X.shape
    topics, feats = T.shape
    clusters, feats = M.shape

    for i in range(0, pts):
        labl_asgnd = int(L[i])

        oldval = P[i, 0]
        currval = (X[i, :].multiply(M[labl_asgnd, :])).sum(axis=None)   # 1st component for point i
        delta = currval - oldval
        print("%s <> Point: %s <> Similarity-Component Delta: %s <> %s" %(string, str(i), str(delta), comment(delta)))
        P[i, 0] = currval
        simi_net_delta += delta

        oldval = P[i, 1]
        currval = 0
        if i in S and labl_asgnd == int(S[i]):  # seeded onion whose label matches
            currval = v                         # 3rd component for point i
        delta = currval - oldval
        print("%s <> Point: %s <> Seed-Component Delta: %s <> %s" %(string, str(i), str(delta), comment(delta)))
        P[i, 1] = currval
        seed_net_delta += delta

        oldval = P[i, 2]
        currval = w * AnalyseMustLinkNeighborsForGivenLabel(i, A, L, labl_asgnd, clusters)     # provenance (4th) component
        delta = currval - oldval
        print("%s <> Point: %s <> Provenance-Component Delta: %s <> %s" %(string, str(i), str(delta), comment(delta)))
        P[i, 2] = currval
        provenance_net_delta += delta

    for i in range(0, topics):
        oldval = Q[i]
        # 2nd component for topic i only; using the whole matrices here would
        # count the total topic term once per topic.
        currval = u * (T[i, :].multiply(M[i, :])).sum(axis=None)
        delta = currval - oldval
        print("%s <> Topic: %s <> Topic-Component Delta: %s <> %s" %(string, str(i), str(delta), comment(delta)))
        Q[i] = currval
        topic_net_delta += delta

    print("%s <> Net Similarity-Component Delta: %s <> %s" %(string, str(simi_net_delta), comment(simi_net_delta)))
    print("%s <> Net Seed-Component Delta: %s <> %s" %(string, str(seed_net_delta), comment(seed_net_delta)))
    print("%s <> Net Topic-Component Delta: %s <> %s" %(string, str(topic_net_delta), comment(topic_net_delta)))
    print("%s <> Net Provenance-Component Delta: %s <> %s" %(string, str(provenance_net_delta), comment(provenance_net_delta)))

    overall_delta = simi_net_delta + seed_net_delta + topic_net_delta + provenance_net_delta
    print("%s <> Overall change: %s <> %s" %(string, str(overall_delta), comment(overall_delta)))

    return (P, Q)

	
# Following function used to comment on the delta change.
def comment(val):
    """Describe the sign of a delta: 'INCREASE', 'DECREASE' or 'NO CHANGE'."""
    if val > 0:
        return 'INCREASE'
    if val < 0:
        return 'DECREASE'
    return 'NO CHANGE'


# The following assigns the points to the different clusters.
# X <- tfidf pts (normalized), M <- current cluster centers
# S <- seeded labels, L <- actual labels assigned by the algorithm
# hyper-parameters: v <- for scoring on matching seeded labels
# A <- must-link adjacency matrix
# In the following implementation, must-link neighbors given absolute priority.
# This is a very strict enforcement. 
# Details at http://nichol.as/papers/Wagstaff/Constrained%20k-means%20clustering%20with%20background.pdf
def AssignClusterStrictConstraint(X, M, S, v, A):
    """Assign every point to a cluster, giving must-link neighbors priority.

    X <- tfidf pts, M <- current cluster centers
    S <- seeded labels {point id: label id}
    v <- score added to the similarity of a seeded point with its seed label
    A <- must-link adjacency matrix

    Points are processed in id order. A point with must-link neighbors among
    the already processed points takes their majority label. Any other point
    goes to the cluster with the highest (seed-biased) similarity; it keeps
    label 0 when no similarity is greater than 0.

    Returns the numpy array L with one label per point.
    """
    n_pts, _ = X.shape
    n_clusters, _ = M.shape
    L = np.zeros(n_pts)     # trivial initialization

    for pt in range(n_pts):
        # majority vote over the labels assigned so far in this iteration
        voted = AnalyseMustLinkNeighbors(pt, A, L, n_clusters)
        if voted > -1:
            L[pt] = voted
            continue

        # no must-link label available: pick the nearest cluster
        row = X[pt, :]
        is_seeded = pt in S
        best = 0
        for k in range(n_clusters):
            score = (row.multiply(M[k, :])).sum(axis=None)    # cosine similarity
            if is_seeded and k == int(S[pt]):
                score += v      # bias towards the seed label
            if score > best:
                best = score
                L[pt] = k       # update label

    return L
	

# The following assigns the points to the different clusters.
# X <- tfidf pts (normalized), M <- current cluster centers
# S <- seeded labels, L <- actual labels assigned by the algorithm
# hyper-parameters: v <- for scoring on matching seeded labels, w <- weight for incorporating must-link neighbors
# A <- must-link adjacency matrix
# In the following implementation, must-link neighbors are factored into the computation of the objective function. 
# Some flexibility is allowed using the hyper-parameter w introduced in the objective function. 
def AssignCluster(X, M, S, v, w, A):
    """Assign every point to the cluster with the highest biased similarity.

    X <- tfidf pts, M <- current cluster centers
    S <- seeded labels {point id: label id}
    v <- score added to the similarity of a seeded point with its seed label
    w <- weight of the must-link neighbors that already carry the candidate label
    A <- must-link adjacency matrix, A[i, j] == 1 marks a must-link

    Points are processed in id order; for point i only the must-link
    neighbors with ids smaller than i (already labelled in this pass) are
    counted. A point keeps label 0 when no score is greater than 0.

    Returns the numpy array L with one label per point.
    """
    pts, feats = X.shape
    clusters, feats = M.shape
    L = np.zeros(pts)   # trivial initialization

    for i in range(0, pts):
        # Label histogram of the must-link neighbors of i. It does not depend
        # on the candidate cluster, so it is built once per point instead of
        # once per (point, cluster) pair.
        neighbor_votes = np.zeros(int(clusters))
        for j in range(0, i):
            if A[i, j] == 1:
                neighbor_votes[int(L[j])] += 1

        row = X[i, :]
        max_similarity = 0
        for c in range(0, clusters):
            similarity_c = (row.multiply(M[c, :])).sum(axis=None)  # cosine similarity
            if i in S and c == int(S[i]):   # this onion is seeded and the label matches the seed
                similarity_c += v           # bias the similarity in this case
            similarity_c += w * neighbor_votes[c]   # the provenance consideration
            if similarity_c > max_similarity:
                max_similarity = similarity_c
                L[i] = c    # update label

    return L
	

# The following recomputes the cluster centers from the current assignment.
# X <- tfidf pts (normalized), T <- manual topics as generated from expert keywords
# L <- actual labels assigned by the algorithm
# hyper-parameters: u <- weighs closeness to manual topics
# Returns M <- the updated cluster centers (L2-normalized by row)
def UpdateClusterCenters(X, T, L, u):
    """Recompute the cluster centers from the current assignment.

    X <- tfidf pts, T <- manual topics (clusters x features)
    L <- labels assigned by the algorithm, one per point
    u <- weight of the manual topics

    Each center is the sum of its points plus u times its manual topic,
    divided by the number of points in the cluster. Clusters without points
    stay at zero. The result is L2-normalised by row.

    Returns the new centers as a lil_matrix with the shape of T.
    """
    n_pts, _ = X.shape
    n_clusters, n_feats = T.shape
    centers = sparse.lil_matrix((n_clusters, n_feats))  # trivial initialization
    members = np.zeros(n_clusters)                      # number of points in each cluster

    for idx in range(n_pts):
        k = int(L[idx])
        members[k] += 1
        centers[k, :] += X[idx, :]      # summing up for the cluster means

    for k in range(n_clusters):
        if members[k] <= 0:             # empty cluster, nothing to average
            continue
        centers[k, :] += u * T[k, :]    # also biasing the center with the manual topic
        centers[k, :] /= members[k]     # calculating the cluster mean

    # l2 normalization by row for each cluster center
    centers = normalize(centers, norm='l2', axis=1, copy=False).tolil()
    return centers
	

# The following performs the clustering algorithm.
# X <- tfidf pts (normalized), M <- current cluster centers, T <- manual topics as generated from expert keywords
# S <- seeded labels, L <- actual labels assigned by the algorithm
# hyper-parameters: u <- weighs closeness to manual topics, v <- scores on matching seeded labels,
# 		    w <- weight for incorporating must-link neighbors
# Convergence criteria: Reaches max_iterations or difference in objective function less than the tolerance
# max_iter <- max iterations, tol <- tolerance
# debug <- True or False
# A <- must-link adjacency matrix
# In the following implementation, must-link neighbors are factored into the computation of the objective function. 
# Some flexibility is allowed using the hyper-parameter w introduced in the objective function. 
def DoClustering(X, M, T, S, u, v, w, A, tol, max_iter, debug):
    """Run the clustering loop with must-link neighbors weighted by w.

    X <- tfidf pts, M <- initial cluster centers, T <- manual topics
    S <- seeded labels {point id: label id}
    u, v, w <- hyper-parameters for manual topics, seeds and must-link neighbors
    A <- must-link adjacency matrix
    tol <- tolerance, max_iter <- maximum number of iterations
    debug <- when true, RunDebug is called after every iteration

    Every iteration assigns the points (AssignCluster), updates the centers
    (UpdateClusterCenters) and evaluates the objective function. The loop
    stops after max_iter iterations or as soon as the absolute change of the
    objective value is below tol.

    Returns (M, L): the final centers and the labels of the last assignment.
    When max_iter is not positive no iteration runs; the given centers are
    returned together with all-zero labels instead of failing on an unbound L.
    """
    print("=== Settings ===")
    print("* Tolerance: %s ..." %(str(tol)))
    print("* Max iterations: %s ..." %(str(max_iter)))
    print("* Hyperparameter u (weighs closeness to manual topics): %s ..." %(str(u)))
    print("* Hyperparameter v (importance of matching seeded labels): %s ..." %(str(v)))
    print("* Hyperparameter w (importance of must-link neighbors): %s ..." %(str(w)))
    print("* Run Debug: %s ..." %(debug))
    print("=== *** === \n")

    print("=== Beginning the clustering algorithm ===")

    pts, feats = X.shape
    topics, feats = T.shape
    comps = 3
    P = np.zeros((pts, comps))  # per-point contributions tracked by RunDebug
    Q = np.zeros(topics)        # per-topic contributions tracked by RunDebug
    L = np.zeros(pts)           # trivial labels, returned only if no iteration runs

    t0 = time()
    curr_objval = 0.0
    for i in range(0, max_iter):
        L = AssignCluster(X, M, S, v, w, A)     # e-step
        print("* Iteration %s: Cluster assigned ..." %(str(i)))
        sys.stdout.flush()

        M = UpdateClusterCenters(X, T, L, u)    # m-step
        print("* Iteration %s: Centers updated ..." %(str(i)))
        sys.stdout.flush()

        prev_objval = curr_objval
        curr_objval = ObjFuncVal(X, M, T, S, L, u, v, w, A)
        print("* Iteration %s: Objective value = %0.6f." %(str(i), curr_objval))
        sys.stdout.flush()

        diff = curr_objval - prev_objval
        print("* Iteration %s: Change in objective value = %0.6f." %(str(i), diff))
        sys.stdout.flush()

        if debug:
            string = 'Iteration ' + str(i) + ':'
            P, Q = RunDebug(X, M, T, S, L, u, v, w, A, P, Q, string)
            sys.stdout.flush()

        if abs(diff) < tol:
            break

    print("Finishing the clustering algorithm in %0.3fs ...\n" %(time() - t0))
    return (M, L)
	

# Clustering loop with strictly enforced must-link constraints.
# Must-link neighbors are given absolute priority during cluster assignment
# (a very strict enforcement). Details at
# http://nichol.as/papers/Wagstaff/Constrained%20k-means%20clustering%20with%20background.pdf
#
# X <- tfidf pts (normalized), M <- starting cluster centers (replaced on every iteration),
# T <- manual topics as generated from expert keywords, S <- seeded labels,
# A <- must-link adjacency matrix
# hyper-parameters: u <- weighs closeness to manual topics,
#                   v <- scores on matching seeded labels,
#                   w <- weight for must-link neighbors; it is passed only to the objective
#                        value (and the debug run), never to the assignment step
# Convergence criteria: max_iter iterations are reached, or the absolute change in the
# objective value between two consecutive iterations falls below tol.
# debug <- True or False; when set, RunDebug is called once per iteration
# Returns the tuple (M, L): final cluster centers and the labels assigned in the last iteration.
# NOTE: L is bound only inside the loop, so max_iter has to be at least 1.
def DoClusteringStrictConstraint(X, M, T, S, u, v, w, A, tol, max_iter, debug):
	def _progress(message):
		# Print one progress line and flush so it becomes visible right away.
		print(message)
		sys.stdout.flush()

	print("=== Settings ===")
	print("* Tolerance: %s ..." % str(tol))
	print("* Max iterations: %s ..." % str(max_iter))
	print("* Hyperparameter u (weighs closeness to manual topics): %s ..." % str(u))
	print("* Hyperparameter v (importance of matching seeded labels): %s ..." % str(v))
	print("* Hyperparameter w (importance of must-link neighbors): %s ..." % str(w))
	print("* Run Debug: %s ..." % debug)
	print("* CAUTION: We are using the hyperparameter w = %s for objective value evaluation and comparison..." % w)
	print("*          We NEVER use the hyperparameter w in cluster assignment...")
	print("=== *** === \n")

	print("=== Beginning the clustering algorithm ===")

	num_pts, _ = X.shape
	num_topics, _ = T.shape
	# State handed to RunDebug and received back from it on every debug iteration;
	# what the arrays hold is defined by RunDebug.
	debug_pts = np.zeros((num_pts, 3))
	debug_topics = np.zeros(num_topics)

	started = time()
	objval = 0.0	# the first "change" is therefore measured against 0.0
	for iteration in range(max_iter):
		tag = str(iteration)

		L = AssignClusterStrictConstraint(X, M, S, v, A)	#e-step
		_progress("* Iteration %s: Cluster assigned ..." % tag)

		M = UpdateClusterCenters(X, T, L, u)	#m-step
		_progress("* Iteration %s: Centers updated ..." % tag)

		last_objval = objval
		objval = ObjFuncVal(X, M, T, S, L, u, v, w, A)
		_progress("* Iteration %s: Objective value = %0.6f." % (tag, objval))

		change = objval - last_objval
		_progress("* Iteration %s: Change in objective value = %0.6f." % (tag, change))

		if debug:
			label = 'Iteration ' + tag + ':'
			debug_pts, debug_topics = RunDebug(X, M, T, S, L, u, v, w, A, debug_pts, debug_topics, label)
			sys.stdout.flush()

		if abs(change) < tol:
			break

	print("Finishing the clustering algorithm in %0.3fs ...\n" % (time() - started))
	return (M, L)
	


# The following summarizes the final results.
# It prints the following:
# -> 1. For each onion, its original label and the cluster it got assigned to, i.e.
#       <onion-name, original cluster label, original label id, assigned cluster id>
# -> 2. Distribution of points over the original labels
# -> 3. Distribution of points over the assigned clusters
# fL <- final assigned clusters for each point, iL <- initial assigned label ids for each point
# onionnames <- names/description of the onions, labelnames <- original names/description of the labels
# Label ids may be ints or integer-valued floats: they are cast with int() before
# being used as an index. The ids are still printed exactly as they were given.
# Nothing is returned.
def ResultSummary(fL, iL, onionnames, labelnames):
	print("=== SUMMARY ===") 

	pts = len(iL)
	clusters = len(labelnames)
	num_pts_i = np.zeros(clusters)	# points per original label
	num_pts_f = np.zeros(clusters)	# points per assigned cluster
	for i in range(0,pts):
		# Cast once and reuse: a float id cannot index labelnames directly,
		# and the counters below need the same integer id anyway.
		orig_id = int(iL[i])
		asgn_id = int(fL[i])
		print("* Onion: %s, Original Label: %s, Original ID: %s, Assigned ID: %s" % (str(onionnames[i]), labelnames[orig_id], str(iL[i]), str(fL[i])))
		num_pts_i[orig_id] += 1
		num_pts_f[asgn_id] += 1
	print("\n")
	for i in range(0,clusters):
		print("* Original Label: %s, Original ID: %s, Original Pts: %s" % (labelnames[i], str(i), str(num_pts_i[i])))
	print("\n")
	for i in range(0,clusters):
		print("* Assigned Label ID: %s, Assigned Pts: %s" % (str(i), str(num_pts_f[i])))
	print("\n")

	print("=== * === * === \n") 
	

# Describes every cluster center by its heaviest keywords.
# For each cluster at most max_keywords features are kept (zero-weight features are
# ignored) and printed from the largest weight down to the smallest.
# cluster_centers <- 2-D matrix, one row per cluster and one column per feature
# vocab <- indexable by feature index, gives the keyword text
# max_keywords <- upper bound on the keywords printed per cluster
def ClusterCenterDescription(cluster_centers, vocab, max_keywords):
	print("=== CLUSTER CENTER DESCRIPTION  ===")

	num_clusters, num_feats = cluster_centers.shape
	for cluster_id in range(num_clusters):
		print("\n* Cluster No. %s" % (str(cluster_id)))

		# Bounded selection of (weight, feature index) pairs for this cluster.
		kept = []
		for feat in range(num_feats):
			weight = cluster_centers[cluster_id, feat]
			if weight == 0:
				continue
			if len(kept) < max_keywords:	# still room: take it unconditionally
				kept.append((weight, feat))
				continue
			kept_weights = [pair[0] for pair in kept]
			lowest = min(kept_weights)
			if weight > lowest:	# evict the first of the currently weakest entries
				kept[kept_weights.index(lowest)] = (weight, feat)

		# A stable descending sort keeps equally weighted keywords in selection order.
		ranked = sorted(kept, key=lambda pair: pair[0], reverse=True)
		for rank, (weight, feat) in enumerate(ranked):
			print("* Keyword %s: %s -> val: %s" % (str(rank), vocab[int(feat)], str(weight)))

	print("=== * === * === \n")
	

# Builds the clustering confusion matrix and prints it cluster by cluster.
# Entry (r, c) counts the points assigned to cluster r whose original label is c, so
# assigned labels index the rows and original labels index the columns.
# asgn_labels <- assigned labels, orig_labels <- original labels (one entry per point),
# labelnames <- label descriptions; their count fixes the matrix to be square
# Returns the confusion matrix as a float numpy array.
def getConfusionMatrix(asgn_labels, orig_labels, labelnames):
	size = len(labelnames)
	cm = np.zeros((size, size))

	# Tally every point at (assigned cluster, original class).
	for idx in range(len(orig_labels)):
		row = int(asgn_labels[idx])
		col = int(orig_labels[idx])
		cm[row, col] += 1

	banner = "============================================"
	for cluster_id in range(size):
		print(banner)
		print("*** CONFUSION MATRIX FOR CLUSTER %s ***" % (cluster_id))
		print(banner)
		for class_id in range(size):
			print("%s constituents: %s" % (labelnames[class_id], str(cm[cluster_id, class_id])))
		print("=== * === * === \n")

	return cm
	

# Evaluate clustering performance with the standard metrics built into Scikit
# (the sklearn `metrics` module). Every score is printed; nothing is returned.
# Part I compares predicted labels against the true labels: adjusted Rand, mutual
# information (raw / adjusted / normalized), homogeneity, completeness, V-measure and
# Fowlkes-Mallows.
# Part II scores the predicted partition against the data itself: silhouette (cosine
# metric) and Calinski-Harabasz.
# labels_true <- actual/original labels
# labels_pred <- predicted labels
# X <- input data
def computeStandardClusteringMetrics(labels_true, labels_pred, X):
	
	print("============================================") 
	print("*** Standard Clustering Metrics - Part I ***") 
	print("============================================") 

	#1.
	arscore = metrics.adjusted_rand_score(labels_true, labels_pred) 
	print("* Overall Adjusted Rand score: %s\n" % (arscore))

	#2.
	miscore = metrics.mutual_info_score(labels_true, labels_pred)
	print("* Overall Mutual Info score: %s" % (miscore))
	amiscore = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
	print("* Overall Adjusted Mutual Info score: %s" % (amiscore))
	nmiscore = metrics.normalized_mutual_info_score(labels_true, labels_pred)
	print("* Overall Normalized Mutual Info score: %s\n" % (nmiscore))

	#3.
	hscore = metrics.homogeneity_score(labels_true, labels_pred)
	print("* Overall Homegeneity score: %s" % (hscore))
	cscore = metrics.completeness_score(labels_true, labels_pred)
	print("* Overall Completeness score: %s" % (cscore))
	vscore = metrics.v_measure_score(labels_true, labels_pred)
	print("* Overall V-measure score: %s \n" % (vscore))

	#4.
	fmscore = metrics.fowlkes_mallows_score(labels_true, labels_pred) 
	print("* Overall Fowlkes Mallows score: %s\n" % (fmscore))


	print("=============================================") 
	print("*** Standard Clustering Metrics - Part II ***") 
	print("=============================================") 

	#5.
	sscore = metrics.silhouette_score(X, labels_pred, metric='cosine')
	print("* Overall Silhouette score: %s\n" % (sscore))

	#6.
	# scikit-learn renamed the misspelled `calinski_harabaz_score` to
	# `calinski_harabasz_score` and later removed the old name. Prefer the new
	# spelling and fall back to the old one so older installations keep working.
	ch_func = getattr(metrics, 'calinski_harabasz_score', None)
	if ch_func is None:
		ch_func = metrics.calinski_harabaz_score
	chscore = ch_func(X, labels_pred)
	print("* Overall Calinski Harabaz score: %s\n" % (chscore))



# Compute the pairwise precision, recall and f-measure to evaluate the clustering strategy.
# A pair of points is a true positive when both points share the same cluster AND the
# same original class, so every count below is "n choose 2" of a group size:
#   TP      <- pairs inside each (cluster, class) cell
#   TP + FP <- pairs inside each cluster (row totals)
#   TP + FN <- pairs inside each class (column totals)
# cm <- the confusion matrix representing the clustering results, with assigned labels as rows and original labels as columns.
# Prints the three scores and returns them as (precision, recall, f-measure).
# A score whose denominator is zero (no same-cluster pairs, no same-class pairs, or
# precision + recall == 0) is reported as 0.0 rather than nan.
def evaluateClustering(cm):
	counts = np.asarray(cm, dtype=float)

	def _num_pairs(group_sizes):
		# Number of unordered pairs that can be drawn from groups of the given sizes.
		return (group_sizes * (group_sizes - 1) / 2.0).sum()

	corr_pred = _num_pairs(counts)			#true-positives (TP)
	tot_pred = _num_pairs(counts.sum(axis=1))	#true-positives + false-positives (TP+FP)
	tot_actual = _num_pairs(counts.sum(axis=0))	#true-positives + false-negatives (TP+FN)

	# Guard every division: a zero denominator would otherwise produce nan
	# together with a numpy RuntimeWarning.
	prec = corr_pred / tot_pred if tot_pred > 0 else 0.0
	recall = corr_pred / tot_actual if tot_actual > 0 else 0.0
	fmeas = 2 * prec * recall / (prec + recall) if (prec + recall) > 0 else 0.0

	print("* Overall pairwise precision: %s" % (prec))
	print("* Overall pairwise recall: %s" % (recall))
	print("* Overall pairwise F1 measure: %s \n" % (fmeas))

	return (prec, recall, fmeas)


# Assigns every test point to a cluster and reports how well that matches the
# original labels.
# Each point goes to the center with the largest dot product against it (the cosine
# similarity when both vectors are unit-normalized). Only a similarity strictly greater
# than 0 can claim a point, so a point with no positive similarity stays in cluster 0.
# X <- tfidf test pts (normalized), M <- current cluster centers
# O <- original labels of the test points
# labelnames <- description of the labels
# Prints the confusion matrices, the pairwise scores and the standard metrics;
# nothing is returned.
def RunTesting(X, M, O, labelnames):
	print("----------------------------------------- \n")
	print(" ... Beginning the testing process ... ")
	print("----------------------------------------- \n")

	num_pts, _ = X.shape
	num_clusters, _ = M.shape
	assigned = np.zeros(num_pts)	#every point starts out in cluster 0
	for row in range(num_pts):
		point = X[row, :]
		best_similarity = 0
		for center in range(num_clusters):
			similarity = point.multiply(M[center, :]).sum(axis=None)
			if similarity > best_similarity:	#strict: the first best center wins ties
				best_similarity = similarity
				assigned[row] = center

	evaluateClustering(getConfusionMatrix(assigned, O, labelnames))

	computeStandardClusteringMetrics(O, assigned.tolist(), X.toarray())

	print("---------------------")
	print(" ... End of test ... ")
	print("---------------------")
	
# =========================================================================================================================


#The main script begins here. It runs at module level (i.e. whenever this file is
#executed or imported) and performs the whole pipeline in the numbered order below.
#stdout is flushed after every step so progress output shows up promptly.
#
#1. Process the command-line arguments
stopwords_file,keywords_file,seeded_labels_file,input_dir,num_cluster,strict_test_file,debug,noise,must_link_file = ProcessArguments()
sys.stdout.flush()

#Optional: Process the stopwords file, if any
swl = ProcessStopWordsFile(stopwords_file)
sys.stdout.flush()

#2. Perform TF-IDF conversion of the input data directory
tfidf_pts_X, tfidf_vocab_X, filenames_X, labels_X, labelnames_X = ProcessInputDataDirectory(input_dir, swl)
sys.stdout.flush()

#Optional: Process the test file, if any.
#Besides producing the test records and their labels, this rebinds filenames_X,
#labels_X and tfidf_pts_X to the values ProcessTestFile returns.
if strict_test_file != None:
	test_recs, test_labls, filenames_X, labels_X, tfidf_pts_X = ProcessTestFile(strict_test_file, input_dir, filenames_X, labels_X, labelnames_X, tfidf_pts_X)
	sys.stdout.flush()

#3. Process the seeded labels file
seeded_labels = ProcessSeededLabelsFile(seeded_labels_file, input_dir, filenames_X, labelnames_X)
sys.stdout.flush()

#Optional: Noise perturbation of the seeded labels, if any.
#It is applied ONLY when we are testing (a test file was given) and noise > 0.
if strict_test_file != None and int(noise)>0:
	seeded_labels = PerturbWithNoise(seeded_labels, labelnames_X, noise)
	sys.stdout.flush()

#4. Process the manual list of keywords provided by domain experts
manual_topics, manual_topic_indexes = ProcessKeywordsFile(keywords_file, labelnames_X, tfidf_vocab_X, num_cluster)
sys.stdout.flush()

#5. Process the must-link onions file
must_link_matrix = ProcessMustLinkFile(must_link_file, filenames_X)
sys.stdout.flush()

#6. Cluster center initialization
# --- Alternative initialization that also uses the seeded labels (disabled):
#cluster_centers = UseSeedsToImproveClusterCenters(manual_topics, manual_topic_indexes, labelnames_X, tfidf_pts_X, seeded_labels)
# ---
cluster_centers = InitializeClusterCenters(manual_topics, manual_topic_indexes, labelnames_X)
sys.stdout.flush()

#7. Perform clustering -
#   Set tolerance, max_iterations and hyper-parameters u,v,w
#   u <- weighs closeness to manual topics, v <- importance of matching seeded labels,
#   w <- importance of must-link neighbors
u = 0.0
v = 0.0
w = 0.0
# ->>> All three weights are zero: completely unsupervised k-means, i.e. the keywords,
#      must-link pairs and seeded labels are treated as if they were empty.
#      0.01 and 30 are presumably the tolerance and the max iterations (that is their
#      position in the DoClusteringStrictConstraint signature) - confirm against DoClustering.
cluster_centers, assigned_labels = DoClustering(tfidf_pts_X, cluster_centers, manual_topics, seeded_labels, u, v, w, must_link_matrix, 0.01, 30, debug)
# Strict must-link variant with the same arguments (disabled):
#cluster_centers, assigned_labels = DoClusteringStrictConstraint(tfidf_pts_X, cluster_centers, manual_topics, seeded_labels, u, v, w, must_link_matrix, 0.01, 30, debug)
sys.stdout.flush()

#8. Print a result summary
ResultSummary(assigned_labels, labels_X, filenames_X, labelnames_X)
sys.stdout.flush()

#9. Print cluster center descriptions (at most 50 keywords per cluster)
ClusterCenterDescription(cluster_centers, tfidf_vocab_X, 50)
sys.stdout.flush()

#10. Print confusion matrices
cm_result = getConfusionMatrix(assigned_labels, labels_X, labelnames_X)
sys.stdout.flush()

#11. Print cluster evaluation metrics (pairwise precision, recall and f1 measure) & some other standard metrics
#    The standard metrics receive the tf-idf points converted to a dense array.
evaluateClustering(cm_result)
sys.stdout.flush()
computeStandardClusteringMetrics(labels_X, assigned_labels, tfidf_pts_X.toarray())
sys.stdout.flush()

#12. Run on the completely exclusive non-overlapping test set.
#    test_recs / test_labls exist only when a test file was given, hence the same guard as above.
if strict_test_file != None:
	RunTesting(test_recs, cluster_centers, test_labls, labelnames_X)
	sys.stdout.flush()

# =========================================================================================================================

