This page was saved using WebZIP 6.0.8.918 (Unregistered) on 01/20/05 오후 3:27:46.
Address: http://www.leepoint.net/notes-java/25data/50collections/30maps/ex-wordfreq.html
Title: Java Notes: Example - WordFrequency  •  Size: 10747  •  Last Modified: Fri, 14 Jan 2005 00:25:19 GMT

Java Notes: Example - WordFrequency

This program reads files of words and lists their frequencies. A file of words to ignore can also be supplied. This example illustrates the use of Java 5's generic data structures (Sets, Maps, and ArrayList), regular expressions (split(), Pattern, and Matcher), and BufferedReader. A command-line interface is given, but an interesting exercise would be to supply a GUI.

A text command-line interface

  1 
  2 
  3 
  4 
  5 
  6 
  7 
  8 
  9 
 10 
 11 
 12 
 13 
 14 
 15 
 16 
 17 
 18 
 19 
 20 
 21 
 22 
 23 
 24 
 25 
 26 
 27 
 28 
 29 
 30 
 31 
 32 
 33 
 34 
 35 
 36 
 37 
 38 
 39 
 40 
 41 
 42 
 43 
 44 
 45 
// wordfreq/WordFrequencyCmd.java -- Main program to test WordFrequency
// Fred Swartz - October 2004

import java.io.*;
import java.util.*;

/** Command-line driver that prints the word frequencies of a source file,
 * skipping the words listed in an ignore file.
 * All of the counting is delegated to a WordFrequency object; this class
 * only parses the arguments and prints the results.
 * @author Fred Swartz
 * @version 2004-10-17
 */
class WordFrequencyCmd {
    //========================================================= main
    /** Program entry point.  All code is put into main.
     * Exits with status 1 if the argument count is wrong or a file
     * cannot be read.
     * @param args args[0] is the file to analyze, args[1] is the file
     *             of words to ignore.
     */
    public static void main(String[] args) {
        //-- Expects two file names on run command.
        if (args.length != 2) {
            // Name the class that is actually run, and report on stderr.
            System.err.println("Usage: java WordFrequencyCmd inputFile ignoreFile");
            System.exit(1);
        }
        
        try {
            //-- Supply the two files to a WordFrequency object.
            //   The ignore list must be loaded before the source is processed.
            WordFrequency wf = new WordFrequency();
            wf.ignoreFile(new File(args[1]));
            wf.processFile(new File(args[0]));
            
            //-- Get the results into parallel arrays sized by the entry count.
            int n = wf.getEntryCount();
            String[] wrds = new String[n];
            int[] frequency = new int[n];
            wf.getWordFrequency(wrds, frequency);
            
            //-- Print the results, one "count word" pair per line, in the
            //   order in which getWordFrequency filled the arrays.
            for (int i=0; i<n; i++) {
                System.out.println(frequency[i] + " " + wrds[i]);
            }
            
            System.out.println("\nNumber of source words: " + wf.getWordCount());
            System.out.println("\nNumber of unique words: " + n);
        } catch (IOException iox) {
            // A read failure is an error: report it on stderr and signal
            // failure to the caller through the exit status.
            System.err.println(iox);
            System.exit(1);
        }
    }
}//end class

The "Model" -- Word frequency without any user interface

The model (the logic of the program without any user interface) is implemented primarily in the WordFrequency class, which uses two utility classes: CompareByFrequency to do the sorting, and MutableInteger to record the frequency counts.

  1 
  2 
  3 
  4 
  5 
  6 
  7 
  8 
  9 
 10 
 11 
 12 
 13 
 14 
 15 
 16 
 17 
 18 
 19 
 20 
 21 
 22 
 23 
 24 
 25 
 26 
 27 
 28 
 29 
 30 
 31 
 32 
 33 
 34 
 35 
 36 
 37 
 38 
 39 
 40 
 41 
 42 
 43 
 44 
 45 
 46 
 47 
 48 
 49 
 50 
 51 
 52 
 53 
 54 
 55 
 56 
 57 
 58 
 59 
 60 
 61 
 62 
 63 
 64 
 65 
 66 
 67 
 68 
 69 
 70 
 71 
 72 
 73 
 74 
 75 
 76 
 77 
 78 
 79 
 80 
 81 
 82 
 83 
 84 
 85 
 86 
 87 
 88 
 89 
 90 
 91 
 92 
 93 
 94 
 95 
 96 
 97 
 98 
 99 
100 
101 
102 
103 
104 
105 
106 
107 
108 
109 
110 
111 
112 
113 
114 
115 
// wordfreq/WordFrequency.java -- Print word frequency.
// Fred Swartz - October 2004

import java.io.*;
import java.util.*;
import java.util.regex.*;

/** Computes word frequency in source file; ignores words in ignore file.
 * A "word" is a maximal run of the ASCII letters A-Z and a-z; words are
 * compared case-insensitively by converting them to lower case.
 * Uses Sets, Maps, ArrayList, regular expressions, BufferedReader.
 * @author Fred Swartz
 * @version 2004-10-17
 */
public class WordFrequency {
    /** Matches one word: a run of ASCII letters.  The range must be written
     *  as A-Z plus a-z; the single range A-z would also match the
     *  punctuation characters that lie between 'Z' and 'a' in ASCII. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[A-Za-z]+");
    
    //-- Instance variables.
    Set<String> m_ignoreWords;    // Words to ignore (stored in lower case).
    Map<String, MutableInteger> m_wordFrequency;  // Words -> frequency
    int m_totalWords;     // Total number of source words, ignored ones included.
    
    
    /** Constructor.  Starts with no ignore words and no counted words. */
    public WordFrequency() {
        m_ignoreWords   = new HashSet<String>();
        m_wordFrequency = new HashMap<String, MutableInteger>();
        m_totalWords = 0;
    }
    
    
    /** Reads file of words to ignore.  Words are separated by whitespace
     *  and are stored in lower case so that they match the lower-cased
     *  source words.  May be called more than once; the words accumulate.
     *  The file is read with the platform default character encoding.
     *@param ignoreFile File of words to ignore.
     *@throws IOException if the file cannot be opened or read.
     */
    public void ignoreFile(File ignoreFile) throws IOException {
        BufferedReader ignoreRdr = new BufferedReader(new FileReader(ignoreFile));
        try {
            String line;
            while ((line = ignoreRdr.readLine()) != null) {
                for (String word : line.trim().split("\\s+")) {
                    // split() yields one empty string for a blank line.
                    if (word.length() > 0) {
                        m_ignoreWords.add(word.toLowerCase(Locale.ENGLISH));
                    }
                }
            }
        } finally {
            // Close the reader even if reading fails part way through.
            ignoreRdr.close();
        }
    }
    
    
    /** Record the frequency of words in the source file.
     *  May be called more than once; the counts accumulate.
     *  Every word found adds to the total word count, but only words
     *  that are not in the ignore set get a frequency entry.
     *  The file is read with the platform default character encoding.
     *@param sourceFile File of words to process.
     *@throws IOException if the file cannot be opened or read.
     */
    public void processFile(File sourceFile) throws IOException {
        // One Matcher is reused for every line by resetting its input.
        Matcher wordMatcher = WORD_PATTERN.matcher("");
        
        //-- Read source file.   Count non-ignored words.
        BufferedReader sourceRdr = new BufferedReader(new FileReader(sourceFile));
        try {
            String line;
            while ((line = sourceRdr.readLine()) != null) {
                wordMatcher.reset(line);
                while (wordMatcher.find()) {
                    // Fixed locale: the default locale could map ASCII
                    // letters to non-ASCII ones (e.g. Turkish dotless i).
                    String word = wordMatcher.group().toLowerCase(Locale.ENGLISH);
                    m_totalWords++;
                    if (m_ignoreWords.contains(word)) {
                        continue;
                    }
                    MutableInteger value = m_wordFrequency.get(word);
                    if (value == null) {     // Create new entry with count of 1.
                        m_wordFrequency.put(word, new MutableInteger(1));
                    } else {                 // Increment existing count by 1.
                        value.inc();
                    }
                }
            }
        } finally {
            // Close the reader even if reading fails part way through.
            sourceRdr.close();
        }
    }
        
    /** Returns number of words in the source file(s), including ignored words.
     *@return total number of words processed in all source files.
     */
    public int getWordCount() {
        return m_totalWords;
    }
        
    /** Returns the number of unique, non-ignored words, in the source file(s).
     *  This number should be used for the size of the arrays that are
     *  passed to getWordFrequency.
     *@return Number of unique non-ignored source words.
     */
    public int getEntryCount() {
        return m_wordFrequency.size();
    }
        
    /** Stores words and their corresponding frequencies in the parallel array
     *  parameters.  The entries are ordered by a CompareByFrequency comparator.
     *  The size of the arrays must be at least getEntryCount().
     * @param words Unique words that were found in the source file(s).
     * @param counts Frequency of words at corresponding index in words array.
     */
    public void getWordFrequency(String[] words, int[] counts) {
        //-- Sort a copy of the entries so the map itself is left untouched.
        List<Map.Entry<String, MutableInteger>> entries 
             = new ArrayList<Map.Entry<String, MutableInteger>>(m_wordFrequency.entrySet());
        Collections.sort(entries, new CompareByFrequency());
        
        //-- Copy each word and its frequency into the caller's arrays.
        for (int i=0; i<entries.size(); i++) {
            Map.Entry<String, MutableInteger> ent = entries.get(i);
            words[i]  = ent.getKey();
            counts[i] = ent.getValue().intValue();
        }
    }
}//end class
  1 
  2 
  3 
  4 
  5 
  6 
  7 
  8 
  9 
 10 
 11 
 12 
 13 
 14 
 15 
 16 
 17 
 18 
 19 
 20 
 21 
 22 
 23 
// wordfreq/CompareByFrequency.java
// Fred Swartz - October 2004

import java.util.*;

/////////////////////////////////////////////// class CompareByFrequency
/** Orders word/count entries from the least frequent to the most frequent.
 * Entries with the same count are ordered by their words, using
 * String.compareTo.
 */
class CompareByFrequency implements Comparator<Map.Entry<String, MutableInteger>> {
    public int compare(Map.Entry<String, MutableInteger> obj1
                     , Map.Entry<String, MutableInteger> obj2) {
        int firstCount  = obj1.getValue().intValue();
        int secondCount = obj2.getValue().intValue();

        // Different counts decide the order on their own.
        if (firstCount != secondCount) {
            return (firstCount < secondCount) ? -1 : 1;
        }

        // Equal counts: fall back to comparing the words themselves.
        String firstWord  = obj1.getKey();
        String secondWord = obj2.getKey();
        return firstWord.compareTo(secondWord);
    }
}//end class CompareByFrequency
  1 
  2 
  3 
  4 
  5 
  6 
  7 
  8 
  9 
 10 
 11 
 12 
 13 
 14 
 15 
 16 
 17 
 18 
 19 
 20 
 21 
 22 
 23 
 24 
 25 
// wordfreq/MutableInteger.java
// Fred Swartz - October 2004

/////////////////////////////////////////////////// utility class MutableInteger
/** Small mutable holder for an int count.  It exists because Java's data
 * structures store only objects, not primitive values, and an Integer
 * cannot be changed after it is created.
 */
class MutableInteger {
    private int m_count;
    
    /** Creates a holder that starts at the given value.
     * @param value Initial value.
     */
    public MutableInteger(int value) {
        this.m_count = value;
    }
    
    /** Reports the value currently held.
     * @return The current value.
     */
    public int intValue() {
        return this.m_count;
    }
    
    /** Adds one to the held value. */
    public void inc() {
        this.m_count += 1;
    }
}//end class