/* A Java program to generate Word group files from text documents. 
   Required to run tficf + classifiers.

   USAGE: java GenerateWordGrp [input file]
*/

import java.io.*;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.Collections;
import java.util.Enumeration;

/**
 * Builds a word-frequency vocabulary from a text file and prints it to
 * standard output, one record per word, in the form {@code word,count,0,0}.
 *
 * Constructing an instance does all the work: the input file is read into
 * the vocabulary and the records are printed immediately afterwards.
 */
public class GenerateWordGrp 
{
	/** Characters that separate words on an input line (whitespace and common punctuation). */
	private static final String DELIMITERS = " \t\n\r\f,.:;?![]'";

	/** Maps each accepted (lower-cased) word to the number of times it was seen. */
	private Hashtable<String,Integer> vocab;

	/**
	 * Reads {@code ipfile}, counts its words and prints the resulting records.
	 *
	 * @param ipfile path of the text file to process
	 */
	public GenerateWordGrp(String ipfile)
	{
		vocab = new Hashtable<String,Integer>();

		createVocab(ipfile);
		generate();
	}

	/**
	 * Adds the words of {@code ipfile} to the vocabulary.
	 *
	 * Each line is split on {@link #DELIMITERS}; every token is lower-cased and
	 * kept only if it is longer than two characters and starts with a letter
	 * in {@code a}-{@code z}. If the file cannot be opened or read, the error
	 * is reported on standard error and whatever was counted so far is kept.
	 *
	 * @param ipfile path of the text file to read
	 */
	public void createVocab(String ipfile)
	{
		BufferedReader br = null;
		try 
		{
			br = new BufferedReader(new InputStreamReader(new FileInputStream(ipfile)));
			String line;

			while((line = br.readLine()) != null) 
			{
				StringTokenizer st = new StringTokenizer(line, DELIMITERS);

				while(st.hasMoreTokens())
				{
					String word = st.nextToken().toLowerCase();

					// The length test comes first; StringTokenizer never returns empty tokens, so charAt(0) is safe.
					if(word.length() <= 2 || word.charAt(0) < 'a' || word.charAt(0) > 'z') 
						continue;

					// Single lookup: a null result means this is the first occurrence.
					Integer cnt = vocab.get(word);
					vocab.put(word, cnt == null ? 1 : cnt + 1);
				}
			}
		} 
		catch(IOException ioe) 
		{
			// Standard output carries the generated records, so diagnostics
			// go to standard error to avoid corrupting that data.
			System.err.println(ioe.toString());
		}
		finally
		{
			// Close the reader on every path, including a failed read.
			if(br != null)
			{
				try
				{
					br.close();
				}
				catch(IOException ignored)
				{
					// Nothing useful can be done if closing a read-only stream fails.
				}
			}
		}
	}

	/**
	 * Prints one line per vocabulary entry to standard output in the form
	 * {@code word,count,0,0}. The order of the lines is whatever order the
	 * underlying {@link Hashtable} enumerates its keys in.
	 */
	public void generate()
	{
		Enumeration<String> e = vocab.keys();
		while(e.hasMoreElements())
		{
			String word = e.nextElement(); 
			int cnt = vocab.get(word);
			System.out.println(word+","+cnt+",0,0");
		}
	}

	/**
	 * Command-line entry point.
	 *
	 * @param args expects the input file path as the first element; when it
	 *             is missing a usage message is printed to standard error
	 */
	public static void main(String args[])
	{
		if(args == null || args.length < 1)
		{
			System.err.println("USAGE: java GenerateWordGrp [input file]");
			return;
		}

		new GenerateWordGrp(args[0]);
	}
}
