| 1 | /** |
| 2 | * |
| 3 | */ |
| 4 | package net.digitaltsunami.word.sequence; |
| 5 | |
| 6 | /** |
| 7 | * Provides edit distance normalization based on length of terms, with the ratio |
| 8 | * of edits to length. This results in the same number of edits returning a |
| 9 | * closer match for longer terms as opposed to shorter terms. |
| 10 | * |
| 11 | * @author dhagberg |
| 12 | * |
| 13 | */ |
| 14 | public class TermLengthNormalization implements EditDistanceNormalization { |
| 15 | |
| 16 | /** |
| 17 | * Normalize edit distance between fromTerm and toTerm and return a value in |
| 18 | * the range [0,1] with a 0 being a complete mismatch (no characters in |
| 19 | * common) and 1 being an exact match. |
| 20 | * <p> |
| 21 | * Normalization is based on length of terms, with the ratio of edits to |
| 22 | * length. This results in the same number of edits returning a closer match |
| 23 | * for longer terms as opposed to shorter terms. See below for an example. |
| 24 | * |
| 25 | * <pre> |
| 26 | * From To Count Notes |
| 27 | * -------- ------- ----- --------------------------------- |
| 28 | * game g 0.25 Three deletions. Short word length |
| 29 | * gamelan game 0.57 Three deletions. Longer word length |
| 30 | * </pre> |
| 31 | * |
| 32 | * @param editCount |
| 33 | * number of edits to convert fromTerm to toTerm. |
| 34 | * @param fromTerm |
| 35 | * initial term used as baseline |
| 36 | * @param toTerm |
| 37 | * target term from which the edit count will be calculated. |
| 38 | * |
| 39 | * @return a value returned will be in the closed interval: [0, 1] with an |
| 40 | * identical term being 1 and decreasing towards zero as the |
| 41 | * difference in the terms increases. |
| 42 | * @see net.digitaltsunami.word.sequence.EditDistanceNormalization#getNormalizedEditDistance(double, |
| 43 | * java.lang.String, java.lang.String) |
| 44 | */ |
| 45 | @Override |
| 46 | public double getNormalizedEditDistance(double editCount, String fromTerm, String toTerm) { |
| 47 | return TermLengthNormalization.normalizeOnTermLength(editCount, fromTerm, toTerm); |
| 48 | } |
| 49 | |
| 50 | /** |
| 51 | * Normalize edit distance based on term lengths. The provided edit count is |
| 52 | * normalized to return a value in the range [0,1] with a 0 being a complete |
| 53 | * mismatch (no characters in common) and 1 being an exact match. |
| 54 | * <p> |
| 55 | * Normalization is based on length of terms, with the ratio of edits to |
| 56 | * length. This results in the same number of edits returning a closer match |
| 57 | * for longer terms as opposed to shorter terms. See below for an example. |
| 58 | * <pre> |
| 59 | * From To Count Notes |
| 60 | * -------- ------- ----- --------------------------------- |
| 61 | * game g 0.25 Three deletions. Short word length |
| 62 | * gamelan game 0.57 Three deletions. Longer word length |
| 63 | * </pre> |
| 64 | * |
| 65 | * |
| 66 | * @param editCount |
| 67 | * number of edits required to convert fromTerm to toTerm. |
| 68 | * @param fromTerm |
| 69 | * initial term used as baseline |
| 70 | * @param toTerm |
| 71 | * target term from which the edit count will be calculated. |
| 72 | * @return a value returned will be in the closed interval: [0, 1] with an |
| 73 | * identical term being 1 and decreasing towards zero as the |
| 74 | * difference in the terms increases. |
| 75 | */ |
| 76 | protected static double normalizeOnTermLength(double editCount, String fromTerm, String toTerm) { |
| 77 | int minLen = Math.min(fromTerm.length(), toTerm.length()); |
| 78 | int maxLen = Math.max(fromTerm.length(), toTerm.length()); |
| 79 | return ((maxLen - editCount) * minLen) / (maxLen * minLen); |
| 80 | } |
| 81 | } |