This program reads files of words and lists their frequencies. A file of words to ignore can also be supplied. This example illustrates the use of Java 5's generic data structures (Sets, Maps, and ArrayList), regular expressions (split(), Pattern, and Matcher), and BufferedReader. A command-line interface is provided; an interesting exercise would be to add a GUI.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
// wordfreq/WordFrequencyCmd.java -- Main program to test WordFrequency
// Fred Swartz - October 2004
import java.io.*;
import java.util.*;
/** Command-line driver that prints the word frequencies of a source file,
 * skipping every word listed in an ignore file.
 * All counting is delegated to a WordFrequency object; this class only
 * parses the arguments and formats the output.
 * @author Fred Swartz
 * @version 2004-10-17
 */
class WordFrequencyCmd {
    //========================================================= main
    /** Runs the program.
     * Results are written to standard output; usage and I/O errors are
     * written to standard error and end the program with exit status 1.
     * @param args args[0] is the file to analyze, args[1] is the file of
     *             words to ignore.
     */
    public static void main(String[] args) {
        //-- Expects two file names on run command.
        if (args.length != 2) {
            System.err.println("Usage: java WordFrequencyCmd inputFile ignoreFile");
            System.exit(1);
        }

        try {
            //-- Supply the two files to a WordFrequency object.
            //   The ignore file is loaded first so that its words are
            //   already known while the source file is being counted.
            WordFrequency wf = new WordFrequency();
            wf.ignoreFile(new File(args[1]));
            wf.processFile(new File(args[0]));

            //-- Get the results into parallel arrays sized by the entry count.
            int n = wf.getEntryCount();
            String[] wrds = new String[n];
            int[] frequency = new int[n];
            wf.getWordFrequency(wrds, frequency);

            //-- Print the results, one "count word" pair per line.
            for (int i = 0; i < n; i++) {
                System.out.println(frequency[i] + " " + wrds[i]);
            }
            System.out.println("\nNumber of source words: " + wf.getWordCount());
            System.out.println("\nNumber of unique words: " + n);
        } catch (IOException iox) {
            //-- Report the failure on stderr and signal it to the caller
            //   through a non-zero exit status.
            System.err.println(iox);
            System.exit(1);
        }
    }
}//end class
|
The model (the logic of the program without any user interface) is implemented primarily in the WordFrequency class, which uses two utility classes: CompareByFrequency to do the sorting, and MutableInteger to record the frequency counts.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
// wordfreq/WordFrequency.java -- Print word frequency.
// Fred Swartz - October 2004
import java.io.*;
import java.util.*;
import java.util.regex.*;
/** Computes word frequency in source file; ignores words in ignore file.
 * A "word" is a maximal run of the ASCII letters A-Z and a-z; words are
 * converted to lower case before they are counted or ignored.
 * Uses Sets, Maps, ArrayList, regular expressions, BufferedReader.
 * @author Fred Swartz
 * @version 2004-10-17
 */
public class WordFrequency {
    //-- Matches one word. Note the range must be "A-Za-z": the range "A-z"
    //   would also accept the characters [ \ ] ^ _ ` that lie between
    //   'Z' and 'a' in the character table.
    private static final Pattern WORD_PATTERN = Pattern.compile("[A-Za-z]+");

    //-- Instance variables.
    Set<String> m_ignoreWords;                    // Words to ignore (lower case).
    Map<String, MutableInteger> m_wordFrequency;  // Words -> frequency
    int m_totalWords;                             // Total number of source words.

    /** Constructor. Starts with no ignored words and no counts. */
    public WordFrequency() {
        m_ignoreWords = new HashSet<String>();
        m_wordFrequency = new HashMap<String, MutableInteger>();
        m_totalWords = 0;
    }

    /** Reads file of words to ignore. Words are separated by whitespace
     * and are stored in lower case so that they match the lower-cased
     * words produced by processFile. May be called more than once;
     * the ignored words accumulate.
     * @param ignoreFile File of words to ignore.
     * @throws IOException if the file cannot be opened or read.
     */
    public void ignoreFile(File ignoreFile) throws IOException {
        BufferedReader ignoreRdr = new BufferedReader(new FileReader(ignoreFile));
        try {
            String line;
            while ((line = ignoreRdr.readLine()) != null) {
                for (String word : line.trim().split("\\s+")) {
                    // A blank line splits into a single empty string; skip it.
                    if (word.length() > 0) {
                        m_ignoreWords.add(word.toLowerCase(Locale.ENGLISH));
                    }
                }
            }
        } finally {
            // Close the reader even when reading fails part way through.
            ignoreRdr.close();
        }
    }

    /** Record the frequency of words in the source file.
     * May be called more than once; counts accumulate across calls.
     * Every word found adds to the total word count, but only words that
     * are not in the ignore set get a frequency entry.
     * @param sourceFile File of words to process.
     * @throws IOException if the file cannot be opened or read.
     */
    public void processFile(File sourceFile) throws IOException {
        // One matcher is reused for every line to avoid reallocating it.
        Matcher wordMatcher = WORD_PATTERN.matcher("");

        //-- Read source file. Count non-ignored words.
        BufferedReader sourceRdr = new BufferedReader(new FileReader(sourceFile));
        try {
            String line;
            while ((line = sourceRdr.readLine()) != null) {
                wordMatcher.reset(line);
                while (wordMatcher.find()) {
                    // Fixed locale keeps the result independent of the
                    // platform's default locale.
                    String word = wordMatcher.group().toLowerCase(Locale.ENGLISH);
                    m_totalWords++;   // Ignored words still count as source words.
                    if (!m_ignoreWords.contains(word)) {
                        MutableInteger value = m_wordFrequency.get(word);
                        if (value == null) {   // Create new entry with count of 1.
                            m_wordFrequency.put(word, new MutableInteger(1));
                        } else {               // Increment existing count by 1.
                            value.inc();
                        }
                    }
                }
            }
        } finally {
            // Close the reader even when reading fails part way through.
            sourceRdr.close();
        }
    }

    /** Returns number of words in the source file(s), including ignored words.
     * @return total number of words processed in all source files.
     */
    public int getWordCount() {
        return m_totalWords;
    }

    /** Returns the number of unique, non-ignored words, in the source file(s).
     * This number should be used for the size of the arrays that are
     * passed to getWordFrequency.
     * @return Number of unique non-ignored source words.
     */
    public int getEntryCount() {
        return m_wordFrequency.size();
    }

    /** Stores words and their corresponding frequencies in the parallel array
     * parameters. The entries are ordered by CompareByFrequency.
     * The size of the arrays must be at least getEntryCount().
     * @param words Receives the unique words that were found in the source file(s).
     * @param counts Receives the frequency of the word at the corresponding
     *               index in the words array.
     */
    public void getWordFrequency(String[] words, int[] counts) {
        //-- Sort a copy of the entries; the map itself is left untouched.
        List<Map.Entry<String, MutableInteger>> entries
            = new ArrayList<Map.Entry<String, MutableInteger>>(m_wordFrequency.entrySet());
        Collections.sort(entries, new CompareByFrequency());

        //-- Copy each word and its frequency into the caller's arrays.
        for (int i = 0; i < entries.size(); i++) {
            Map.Entry<String, MutableInteger> ent = entries.get(i);
            words[i] = ent.getKey();
            counts[i] = ent.getValue().intValue();
        }
    }
}//end class
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
// wordfreq/CompareByFrequency.java
// Fred Swartz - October 2004
import java.util.*;
/////////////////////////////////////////////// class CompareByFrequency
/** Comparator that orders word entries from least to most frequent.
 * Entries with the same frequency are ordered by their keys using
 * String.compareTo.
 */
class CompareByFrequency implements Comparator<Map.Entry<String, MutableInteger>> {
    /** Compares two word entries.
     * @param obj1 First word/count entry.
     * @param obj2 Second word/count entry.
     * @return -1 or 1 when the counts differ (lower count sorts first),
     *         otherwise the result of comparing the two keys.
     */
    public int compare(Map.Entry<String, MutableInteger> obj1
                     , Map.Entry<String, MutableInteger> obj2) {
        int leftCount = obj1.getValue().intValue();
        int rightCount = obj2.getValue().intValue();

        // Different counts decide the order on their own.
        if (leftCount != rightCount) {
            return (leftCount < rightCount) ? -1 : 1;
        }

        // Equal counts: fall back to the order of the words themselves.
        return obj1.getKey().compareTo(obj2.getKey());
    }
}//end class CompareByFrequency
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
// wordfreq/MutableInteger.java
// Fred Swartz - October 2004
/////////////////////////////////////////////////// utility class MutableInteger
/** Small mutable wrapper around an int, used as a counter.
 * Java's collections hold only Objects, and Integer cannot be changed
 * after creation, so this class provides a value that can be incremented
 * in place.
 */
class MutableInteger {
    private int count;

    /** Creates a counter holding the given starting value.
     * @param value Initial value.
     */
    public MutableInteger(int value) {
        this.count = value;
    }

    /** Returns the current value.
     * @return The value as an int.
     */
    public int intValue() {
        return this.count;
    }

    /** Adds one to the current value. */
    public void inc() {
        this.count += 1;
    }
}//end class
|