-
Notifications
You must be signed in to change notification settings - Fork 0
/
Program1.java
84 lines (74 loc) · 2.9 KB
/
Program1.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/*
* Program1.java
*
* The driver program for CSCI 241's Program 1
*
* Reads a text file and calls Wordifier to learn new words
*
* Do not modify this file.
*
*
* ----------------------------------------------------------------------------
*
* usage:
*
* java Program1 inputFileName countThreshold probabilityThreshold dictionaryFilename
*
* where the arguments are
*
* inputFileName a plaintext file from which your program will estimate counts
* countThreshold two tokens will only be merged if they appear in sequence at least this many times
* probabilityThreshold two tokens will only be merged if their bigram product is at least this high
* dictionaryFilename a dictionary against which to check whether your learned words are actually words
*
*/
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.HashMap;
import java.util.HashSet;
/**
 * Command-line driver for Program 1.
 *
 * Loads the input text and the dictionary through Wordifier, then runs
 * merge rounds: each round computes bigram counts, probabilities and scores,
 * asks Wordifier for the bigrams that pass both thresholds, and resegments
 * the data with them. The loop stops at the first round that yields no new
 * tokens, after which the resulting vocabulary is checked against the
 * dictionary.
 */
public class Program1 {

    /** Expected command line, printed to stderr alongside any argument error. */
    private static final String USAGE =
        "usage: java Program1 inputFileName countThreshold probabilityThreshold dictionaryFilename";

    /**
     * Program entry point.
     *
     * Exits with status 1 (after printing an error and the usage line to
     * stderr) when the argument count is not 4 or when either threshold
     * cannot be parsed as a number.
     *
     * @param args inputFileName, countThreshold (integer),
     *             probabilityThreshold (floating point), dictionaryFilename
     */
    public static void main( String[] args ) {
        if( args.length != 4 ) {
            exitWithUsage("Error: Wrong number of arguments");
        }

        String inputFileName = args[0];
        int countThreshold = parseCountThreshold(args[1]);
        double probabilityThreshold = parseProbabilityThreshold(args[2]);
        String dictionaryFilename = args[3];

        // Read input files
        LinkedList<String> data = Wordifier.loadSentences(inputFileName);
        HashSet<String> dictionary = Wordifier.loadDictionary(dictionaryFilename);

        // Iteratively merge tokens and resegment the text
        int round = 0;
        while( true ) {
            round++;
            System.out.format("= Round %d:%n",round);

            // Fresh, empty maps every round; they are handed to Wordifier below
            HashMap<String,Integer> bigramCounts = new HashMap<String,Integer>();
            HashMap<String,Double> bigramProbs = new HashMap<String,Double>();
            HashMap<String,Double> leftUnigramProbs = new HashMap<String,Double>();
            HashMap<String,Double> rightUnigramProbs = new HashMap<String,Double>();

            // Get bigram product scores
            Wordifier.computeCounts(data,bigramCounts);
            Wordifier.convertCountsToProbabilities(bigramCounts,bigramProbs,leftUnigramProbs,rightUnigramProbs);
            HashMap<String,Double> scores = Wordifier.getScores(bigramProbs,leftUnigramProbs,rightUnigramProbs);

            // Identify the new words (bigrams exceeding the thresholds)
            HashSet<String> newWords = Wordifier.findNewWords(bigramCounts,scores,countThreshold,probabilityThreshold);
            System.out.println("\tAdded " + newWords.size() + " new tokens");

            if( newWords.isEmpty() ) {
                // No tokens identified to merge; our job here is done
                break;
            }

            // Otherwise resegment the data and go to the next round
            data = Wordifier.resegment(data,newWords);
            System.out.println(data.size() + " tokens");
        }

        // Evaluate whether the learned words are actual words
        HashMap<String,Integer> vocab = Wordifier.getVocabulary(data);
        Wordifier.printNumWordsDiscovered(vocab,dictionary);
    }

    /** Parses the countThreshold argument, exiting with a usage error if it is not an integer. */
    private static int parseCountThreshold( String text ) {
        try {
            return Integer.parseInt(text);
        } catch( NumberFormatException e ) {
            exitWithUsage("Error: countThreshold must be an integer, got \"" + text + "\"");
            return 0; // never reached at runtime; exitWithUsage terminates the JVM
        }
    }

    /** Parses the probabilityThreshold argument, exiting with a usage error if it is not a number. */
    private static double parseProbabilityThreshold( String text ) {
        try {
            return Double.parseDouble(text);
        } catch( NumberFormatException e ) {
            exitWithUsage("Error: probabilityThreshold must be a number, got \"" + text + "\"");
            return 0.0; // never reached at runtime; exitWithUsage terminates the JVM
        }
    }

    /** Prints the given error followed by the usage line to stderr, then exits with status 1. */
    private static void exitWithUsage( String message ) {
        System.err.println(message);
        System.err.println(USAGE);
        System.exit(1);
    }
}