| 1 | package net.digitaltsunami.word.sequence; |
| 2 | |
| 3 | /** |
| 4 | * Computes edit distance using the Levenshtein distance metric. Each edit |
| 5 | * (insertion, deletion, substitution) counts as one edit. |
| 6 | * <p> |
| 7 | * This implementation provides a basic count of edits without weights. |
| 8 | * |
| 9 | * @author dhagberg |
| 10 | */ |
| 11 | public class LevenshteinDistanceStrategy implements EditDistanceStrategy { |
| 12 | |
| 13 | /* |
| 14 | * (non-Javadoc) |
| 15 | * |
| 16 | * @see |
| 17 | * net.digitaltsunami.word.sequence.EditDistanceStrategy#getEditCount( |
| 18 | * java.lang.String, java.lang.String) |
| 19 | */ |
| 20 | @Override |
| 21 | public int getEditCount(String fromTerm, String toTerm) { |
| 22 | return calculateEditCount(fromTerm, toTerm); |
| 23 | } |
| 24 | |
| 25 | /* |
| 26 | * (non-Javadoc) |
| 27 | * |
| 28 | * @see |
| 29 | * net.digitaltsunami.word.sequence.EditDistanceStrategy#getEditDistance |
| 30 | * (java.lang.String, java.lang.String) |
| 31 | */ |
| 32 | @Override |
| 33 | public double getEditDistance(String fromTerm, String toTerm) { |
| 34 | return calculateEditCount(fromTerm, toTerm); |
| 35 | } |
| 36 | |
| 37 | |
| 38 | /** |
| 39 | * Compute and return the Levenshtein edit distance between the two provided |
| 40 | * sequences. Each edit (insertion, deletion, substitution) counts as one |
| 41 | * edit. The total number of these edit operations required to convert |
| 42 | * string 1 to string 2 is calculated and returned. |
| 43 | * |
| 44 | * @param fromTerm |
| 45 | * @param toTerm |
| 46 | * @return total number of edits required to convert string 1 to string 2. |
| 47 | */ |
| 48 | public static int calculateEditCount(String fromTerm, String toTerm) { |
| 49 | int nRows = fromTerm.length() + 1; |
| 50 | int nCols = toTerm.length() + 1; |
| 51 | /* |
| 52 | * Create matrix to store edit operations. Each dimension is length + 1 |
| 53 | * to allow for full edit distance values. |
| 54 | */ |
| 55 | int editMatrix[][] = new int[nRows][nCols]; |
| 56 | /* |
| 57 | * Initialize first row and column to contain edit distances as if the |
| 58 | * other string were empty. |
| 59 | */ |
| 60 | for (int r = 0; r < nRows; r++) { |
| 61 | editMatrix[r][0] = r; |
| 62 | } |
| 63 | for (int c = 0; c < nCols; c++) { |
| 64 | editMatrix[0][c] = c; |
| 65 | } |
| 66 | /* |
| 67 | * Compare each character of the string against one another and |
| 68 | * calculate the number of edits needed at each stage. The total number |
| 69 | * of edits will be in editMatrix[fromTerm.len][toTerm.len] |
| 70 | */ |
| 71 | for (int r = 1; r < nRows; r++) { |
| 72 | for (int c = 1; c < nCols; c++) { |
| 73 | if (fromTerm.charAt(r - 1) == toTerm.charAt(c - 1)) { |
| 74 | editMatrix[r][c] = editMatrix[r - 1][c - 1]; |
| 75 | } else { |
| 76 | editMatrix[r][c] = Math.min( |
| 77 | Math.min(editMatrix[r][c - 1] +1 , editMatrix[r - 1][c] + 1), |
| 78 | editMatrix[r - 1][c - 1] + 1); |
| 79 | } |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | // printEditMatrix(fromTerm.toCharArray(), toTerm.toCharArray(), |
| 84 | // editMatrix); |
| 85 | return editMatrix[nRows - 1][nCols - 1]; |
| 86 | } |
| 87 | } |