| 1 | package net.digitaltsunami.word.sequence; |
| 2 | |
| 3 | /** |
| 4 | * Utility methods for processing of sequences. |
| 5 | * |
| 6 | * @author dhagberg |
| 7 | * |
| 8 | */ |
| 9 | public class Sequence { |
| 10 | /** |
| 11 | * Returns one of possibly many longest subsequences of characters common to |
| 12 | * both input strings. As a subsequence, characters are not required to be |
| 13 | * contiguous. |
| 14 | * |
| 15 | * @param string1 |
| 16 | * @param string2 |
| 17 | * @return a string from the set of 0 to many strings, the length of each |
| 18 | * being the longest subsequence found. |
| 19 | */ |
| 20 | public static String getALongestCommonSubsequence(String string1, String string2) { |
| 21 | /* |
| 22 | * Matrix containing results of iterations looking for common |
| 23 | * characters. Arrays for results are 1 greater than length of input |
| 24 | * strings to accommodate 0s in first row and column. |
| 25 | */ |
| 26 | int nRows = string1.length() + 1; |
| 27 | int nCols = string2.length() + 1; |
| 28 | int[][] lcsMatrix = new int[nRows][nCols]; |
| 29 | /* |
| 30 | * Initialize with -1 to indicate value has not yet been determined. The |
| 31 | * first row and column will be will be skipped as they contain 0s used |
| 32 | * for the comparison. |
| 33 | */ |
| 34 | for (int r = 1; r < nRows; r++) { |
| 35 | for (int c = 1; c < nCols; c++) { |
| 36 | lcsMatrix[r][c] = -1; // O(r x c) |
| 37 | } |
| 38 | } |
| 39 | /* |
| 40 | * Generate the LCS values for each row and column |
| 41 | */ |
| 42 | for (int r = 1; r < nRows; r++) { |
| 43 | for (int c = 1; c < nCols; c++) { |
| 44 | getPrefixLenForPosition(string1, string2, r, c, lcsMatrix); // O(r x c) |
| 45 | } |
| 46 | } |
| 47 | return getLongestSequence(string1, string2, lcsMatrix); // O(r + c) |
| 48 | } |
| 49 | |
| 50 | /** |
| 51 | * Calculate the common subsequence length for the current position within |
| 52 | * the LCS matrix. The value will be returned as well as updated within the |
| 53 | * results matrix. |
| 54 | * <p> |
| 55 | * The indexes into the strings will be one less than the provided row and |
| 56 | * columns to allow for the zeros row and column. |
| 57 | * |
| 58 | * @param s1 |
| 59 | * String one. Each character represents a row. |
| 60 | * @param s2 |
| 61 | * String two. Each character represents a column. |
| 62 | * @param row |
| 63 | * current row index into the results matrix. |
| 64 | * @param col |
| 65 | * current column index into the results matrix. |
| 66 | * @param results |
| 67 | * current results matrix populated up to [row,col]. |
| 68 | * |
| 69 | * @return length of longest common subsequence for the given row and |
| 70 | * column. |
| 71 | */ |
| 72 | private static int getPrefixLenForPosition(String s1, String s2, int row, int col, int[][] results) { |
| 73 | // All 0th row and column results are 0. |
| 74 | if (row == 0 && col == 0) { |
| 75 | return 0; |
| 76 | } |
| 77 | if (results[row][col] > -1) { |
| 78 | // Results have already been calculated, return previously |
| 79 | // calculated prefix len. |
| 80 | return results[row][col]; |
| 81 | } |
| 82 | |
| 83 | // Index into strings is one less to account for 0's row and column. |
| 84 | if (s1.charAt(row - 1) == s2.charAt(col - 1)) { |
| 85 | // Match found, increase prefix len. |
| 86 | results[row][col] = results[row - 1][col - 1] + 1; |
| 87 | } else { |
| 88 | if (results[row - 1][col] > results[row][col - 1]) { |
| 89 | results[row][col] = results[row - 1][col]; |
| 90 | } else { |
| 91 | results[row][col] = Math.max(results[row - 1][col], results[row][col - 1]); |
| 92 | } |
| 93 | } |
| 94 | return results[row][col]; |
| 95 | } |
| 96 | |
| 97 | /** |
| 98 | * Return one of possibly many longest subsequences using the pre-compiled |
| 99 | * results contained in the provided matrix (prefixLens). |
| 100 | * |
| 101 | * @param s1 |
| 102 | * @param s2 |
| 103 | * @param prefixLens |
| 104 | * @return |
| 105 | */ |
| 106 | private static String getLongestSequence(String s1, String s2, int[][] prefixLens) { |
| 107 | // Start row and col at last row and col. Matrix is s1.len + 1 x s2.len + 1 |
| 108 | int row = s1.length(); |
| 109 | int col = s2.length(); |
| 110 | StringBuilder sb = new StringBuilder(prefixLens[row][col]); |
| 111 | while (row > 0 && col > 0) { |
| 112 | if (prefixLens[row][col] > prefixLens[row - 1][col]) { |
| 113 | if (prefixLens[row][col] > prefixLens[row][col - 1]) { |
| 114 | // Change in prefix length means common char found, |
| 115 | // save and move to previous row and column |
| 116 | sb.append(s1.charAt(row - 1)); |
| 117 | row--; |
| 118 | col--; |
| 119 | } else { |
| 120 | col--; // Move left |
| 121 | } |
| 122 | } else { |
| 123 | // No change in length up, so move up. There could also have |
| 124 | // been no change in left, but can go only one way. |
| 125 | row--; |
| 126 | } |
| 127 | } |
| 128 | sb.reverse(); |
| 129 | return sb.toString(); |
| 130 | } |
| 131 | |
| 132 | private static void printLCSMatrix(String s1, String s2, int[][] prefixLengths) { |
| 133 | System.out.printf("\n%3s", ""); // Print first Row |
| 134 | System.out.printf("%2s", "0"); // Print 0 col hdr |
| 135 | for (int c = 0; c < s2.length(); c++) { // Print String 2 |
| 136 | System.out.printf("%3s", s2.charAt(c)); |
| 137 | } |
| 138 | System.out.printf("\n"); |
| 139 | System.out.printf("%2s", " 0"); // Print 0 row hdr |
| 140 | for (int c = 0; c <= s2.length(); c++) { // Print 0 for each column |
| 141 | System.out.printf("|%2s", "0"); |
| 142 | } |
| 143 | for (int r = 0; r < s1.length(); r++) { |
| 144 | System.out.printf("\n%2s", s1.charAt(r)); |
| 145 | System.out.printf("%3s", "0"); |
| 146 | for (int c = 0; c < s2.length(); c++) { |
| 147 | System.out.printf("|%2d", prefixLengths[r + 1][c + 1]); |
| 148 | } |
| 149 | } |
| 150 | System.out.println(""); |
| 151 | } |
| 152 | } |