| 1 | package net.digitaltsunami.word.trie; |
| 2 | |
| 3 | import java.io.BufferedReader; |
| 4 | import java.io.File; |
| 5 | import java.io.FileReader; |
| 6 | import java.io.IOException; |
| 7 | import java.io.InputStream; |
| 8 | import java.io.InputStreamReader; |
| 9 | import java.util.HashMap; |
| 10 | import java.util.Map; |
| 11 | |
/**
 * Table of expected-value weights describing how likely one character is to
 * follow another, built from a CSV frequency table.
 *
 * <p>Expected CSV layout (values separated by {@code ,}; character values must
 * NOT be wrapped in double quotes, because only the first character of a cell
 * is used as the key):
 * <ul>
 * <li>Header row: column 0 is ignored, columns {@code 1 .. n-3} name the
 * observed characters (the label {@code FIRST} denotes the root, i.e.
 * {@link #ROOT_CHAR_VAL}), and the last two columns are ignored.</li>
 * <li>Data rows: column 0 is the current character (or {@code FIRST}), column
 * 1 is ignored, columns {@code 2 .. n-3} hold the frequency with which the
 * character named in the matching header column follows the current character,
 * the second to last column is ignored and the last column holds the total
 * used as the divisor.</li>
 * </ul>
 * Blank lines are skipped. Columns in a data row beyond the header width are
 * ignored.
 *
 * @author dhagberg
 *
 */
public class ExpectedValueWeightTable {
    /** Weight used when there is no frequency data (boxed form). */
    public static final Float MIN_WEIGHT = Float.valueOf(0.0f);
    /** Weight used when there is no frequency data (primitive form). */
    public static final float MIN_WEIGHT_PRIMITIVE = 0.0f;
    /** Key representing the root, i.e. the position before the first character. */
    public static final char ROOT_CHAR_VAL = '\0';

    /** Label used in the CSV for the root pseudo character. */
    private static final String ROOT_LABEL = "FIRST";

    /** Current character -> (follow-on character -> expected value weight). */
    private final Map<Character, Map<Character, Float>> weightsByChar;

    /**
     * Build the table from a CSV stream using the platform default charset.
     * The stream is read to its end but is not closed; it remains owned by
     * the caller.
     *
     * @param weightTableCsvStream
     *            stream providing the CSV weight table.
     * @throws IOException
     *             if the stream is empty, cannot be read or contains a
     *             malformed row.
     * @throws NumberFormatException
     *             if a frequency or total cell is not a valid number.
     */
    public ExpectedValueWeightTable(InputStream weightTableCsvStream) throws IOException {
        this(new BufferedReader(new InputStreamReader(weightTableCsvStream)));
    }

    /**
     * Build the table from a CSV file using the platform default charset. The
     * file is closed before this constructor returns, whether or not parsing
     * succeeds.
     *
     * @param weightTableCsvFile
     *            file containing the CSV weight table.
     * @throws IOException
     *             if the file is empty, cannot be read or contains a
     *             malformed row.
     * @throws NumberFormatException
     *             if a frequency or total cell is not a valid number.
     */
    public ExpectedValueWeightTable(File weightTableCsvFile) throws IOException {
        this(parseAndClose(new BufferedReader(new FileReader(weightTableCsvFile))));
    }

    /**
     * Build the table from a reader positioned at the header row of the CSV.
     * The reader is read to its end but is not closed; it remains owned by
     * the caller.
     *
     * @param csvReader
     *            reader providing the CSV weight table.
     * @throws IOException
     *             if there is no header row, the reader fails or a row is
     *             malformed.
     * @throws NumberFormatException
     *             if a frequency or total cell is not a valid number.
     */
    public ExpectedValueWeightTable(BufferedReader csvReader) throws IOException {
        this(parse(csvReader));
    }

    /** Wrap an already parsed weight table. */
    private ExpectedValueWeightTable(Map<Character, Map<Character, Float>> weights) {
        this.weightsByChar = weights;
    }

    /**
     * Return the expected value that <code>next</code> follows
     * <code>current</code>.
     *
     * @param current
     *            current character; use {@link #ROOT_CHAR_VAL} for the root.
     * @param next
     *            candidate follow-on character.
     * @return the weight recorded for the pair, or
     *         {@link #MIN_WEIGHT_PRIMITIVE} if either character is unknown.
     */
    public float getExpectedValue(Character current, Character next) {
        Map<Character, Float> weightsForChar = weightsByChar.get(current);
        if (weightsForChar != null) {
            Float followOnWeight = weightsForChar.get(next);
            if (followOnWeight != null) {
                return followOnWeight;
            }
        }
        return MIN_WEIGHT_PRIMITIVE;
    }

    /** Parse the table from a reader this class opened, always closing it. */
    private static Map<Character, Map<Character, Float>> parseAndClose(BufferedReader reader)
            throws IOException {
        try {
            return parse(reader);
        } finally {
            reader.close();
        }
    }

    /** Read the header and every data row, producing the nested weight map. */
    private static Map<Character, Map<Character, Float>> parse(BufferedReader csvReader)
            throws IOException {
        String line = csvReader.readLine();
        if (line == null) {
            // TODO: Need new exception class for this.
            throw new IOException("File is empty");
        }
        Map<Character, Map<Character, Float>> weights =
                new HashMap<Character, Map<Character, Float>>();

        // Build the outer map of all observed characters for the corpus.
        String[] header = line.split(",");
        // Exclusive end of the character columns: the last two columns are
        // summary columns and never name a character.
        int charColsEnd = header.length - 2;
        // Skip the first column and the last two columns.
        for (int i = 1; i < charColsEnd; i++) {
            weights.put(toKey(header[i], 1), new HashMap<Character, Float>());
        }
        // Total characters following character is in last column.
        int totalColIdx = header.length - 1;

        int lineNumber = 1;
        while ((line = csvReader.readLine()) != null) {
            lineNumber++;
            if (line.trim().length() == 0) {
                // Tolerate blank lines such as a trailing newline.
                continue;
            }
            String[] data = line.split(",");
            if (totalColIdx < 0 || data.length <= totalColIdx) {
                throw new IOException("Line " + lineNumber + " has " + data.length
                        + " column(s); expected at least " + header.length);
            }
            Character key = toKey(data[0], lineNumber);
            Map<Character, Float> weightsForChar = weights.get(key);
            if (weightsForChar == null) {
                // Row for a character that was not named in the header.
                weightsForChar = new HashMap<Character, Float>();
                weights.put(key, weightsForChar);
            }
            float totalFreq = Float.parseFloat(data[totalColIdx]);
            // First 2 and last 2 columns do not contain follow on data. The
            // bound comes from the header so that header[i] is always valid.
            for (int i = 2; i < charColsEnd; i++) {
                float freq = Float.parseFloat(data[i]);
                float weight = totalFreq == 0 ? MIN_WEIGHT_PRIMITIVE : freq / totalFreq;
                weightsForChar.put(header[i].charAt(0), weight);
            }
        }
        return weights;
    }

    /** Convert a CSV cell into its map key, mapping the root label to the root char. */
    private static Character toKey(String cell, int lineNumber) throws IOException {
        if (ROOT_LABEL.equals(cell)) {
            return ROOT_CHAR_VAL;
        }
        if (cell.length() == 0) {
            throw new IOException("Line " + lineNumber + " contains an empty character value");
        }
        return cell.charAt(0);
    }
}