Pages

Thursday, April 19, 2012

A Generic Text Comparison Tool Implementation with LCS Approach





Detecting and showing differences of two texts (especially having hundreds or thousands of lines) is a common problem. Using pure java.lang.String class methods may be a solution, but the most important issue for that kind of operations, "performance" will not be satisfactory. We want an efficient solution which may have a view as below: 

Text Difference Tool Example
The problem contains two parts: 
  • Detecting differences of two texts: For detecting differences, an efficient dynamic algorithm of LCS (Longest Common Subsequence) used in this solution. This solution has O(text1WordCount * text2WordCount) complexity and coded as "longestCommonSubsequence" method below.
  • Visualizing the differences: For visualizing, an HTML tag based approach is used, which marks new words of text2 with green color and old words of text1 with red color. This solution has O(changedWordsCount * (text1WordCount+text2WordCount)) complexity and coded as "markTextDifferences" method below.
Note1: For simplicity, "normalizeText" method is used for removing \n, \t and multiple space characters. 
Note2: This class was created as a Vaadin component. But "longestCommonSubsequence" is pure generic and "markTextDifferences" method is generic on HTML based visual components, so they can also be used with different frameworks.


  1. import java.util.ArrayList;
  2. import com.vaadin.ui.CustomComponent;
  3. import com.vaadin.ui.Label;
  4. import com.vaadin.ui.Layout;
  5. import com.vaadin.ui.VerticalLayout;
  6. /**
  7.  * Text comparison component which marks differences of two texts with colors.
  8.  *
  9.  * @author cb
  10.  */
  11. public class TextCompareComponent extends CustomComponent {
  12.     private Layout mainLayout = new VerticalLayout();
  13.     private ArrayList<String> longestCommonSubsequenceList;
  14.     private static final String INSERT_COLOR = "#99FFCC";
  15.     private static final String DELETE_COLOR = "#CB6D6D";
  16.     public TextCompareComponent(String text1, String text2) {
  17.        
  18.         text1 = normalizeText(text1);
  19.         text2 = normalizeText(text2);
  20.         this.longestCommonSubsequenceList = longestCommonSubsequence(text1, text2);
  21.         String result = markTextDifferences(text1, text2,
  22.             longestCommonSubsequenceList, INSERT_COLOR, DELETE_COLOR);
  23.        
  24.         Label label = new Label(result, Label.CONTENT_XHTML);
  25.         mainLayout.addComponent(label);
  26.         setCompositionRoot(mainLayout);
  27.     }
  28.     /**
  29.      * Finds a list of longest common subsequences (lcs) of given two texts.
  30.      *
  31.      * @param text1
  32.      * @param text2
  33.      * @return - longest common subsequence list
  34.      */
  35.     private ArrayList<String> longestCommonSubsequence(String text1, String text2) {
  36.         String[] text1Words = text1.split(" ");
  37.         String[] text2Words = text2.split(" ");
  38.         int text1WordCount = text1Words.length;
  39.         int text2WordCount = text2Words.length;
  40.        
  41.         int[][] solutionMatrix = new int[text1WordCount + 1][text2WordCount + 1];
  42.        
  43.         for (int i = text1WordCount - 1; i >= 0; i--) {
  44.             for (int j = text2WordCount - 1; j >= 0; j--) {
  45.                 if (text1Words[i].equals(text2Words[j])) {
  46.                     solutionMatrix[i][j] = solutionMatrix[i + 1][j + 1] + 1;
  47.                 }
  48.                 else {
  49.                     solutionMatrix[i][j] = Math.max(solutionMatrix[i + 1][j],
  50.                         solutionMatrix[i][j + 1]);
  51.                 }
  52.             }
  53.         }
  54.        
  55.         int i = 0, j = 0;
  56.         ArrayList<String> lcsResultList = new ArrayList<String>();
  57.         while (i < text1WordCount && j < text2WordCount) {
  58.             if (text1Words[i].equals(text2Words[j])) {
  59.                 lcsResultList.add(text2Words[j]);
  60.                 i++;
  61.                 j++;
  62.             }
  63.             else if (solutionMatrix[i + 1][j] >= solutionMatrix[i][j + 1]) {
  64.                 i++;
  65.             }
  66.             else {
  67.                 j++;
  68.             }
  69.         }
  70.         return lcsResultList;
  71.     }
  72.    
  73.     /**
  74.      * Normalizes given string by deleting \n, \t and extra spaces.
  75.      *
  76.      * @param text - initial string
  77.      * @return - normalized string
  78.      */
  79.     private String normalizeText(String text) {
  80.        
  81.         text = text.trim();
  82.         text = text.replace("\n", " ");
  83.         text = text.replace("\t", " ");
  84.        
  85.         while (text.contains("  ")) {
  86.             text = text.replace("  ", " ");
  87.         }
  88.         return text;
  89.     }
  90.     /**
  91.      * Returns colored inserted/deleted text representation of given two texts.
  92.      * Uses longestCommonSubsequenceList to determine colored sections.
  93.      *
  94.      * @param text1
  95.      * @param text2
  96.      * @param lcsList
  97.      * @param insertColor
  98.      * @param deleteColor
  99.      * @return - colored result text
  100.      */
  101.     private String markTextDifferences(String text1, String text2,
  102.         ArrayList<String> lcsList, String insertColor, String deleteColor) {
  103.         StringBuffer stringBuffer = new StringBuffer();
  104.         if (text1 != null && lcsList != null) {
  105.             String[] text1Words = text1.split(" ");
  106.             String[] text2Words = text2.split(" ");
  107.             int i = 0, j = 0, word1LastIndex = 0, word2LastIndex = 0;
  108.             for (int k = 0; k < lcsList.size(); k++) {
  109.                 for (i = word1LastIndex, j = word2LastIndex;
  110.                     i < text1Words.length && j < text2Words.length;) {
  111.                     if (text1Words[i].equals(lcsList.get(k)) &&
  112.                         text2Words[j].equals(lcsList.get(k))) {
  113.                         stringBuffer.append("<SPAN>" + lcsList.get(k) + " </SPAN>");
  114.                         word1LastIndex = i + 1;
  115.                         word2LastIndex = j + 1;
  116.                         i = text1Words.length;
  117.                         j = text2Words.length;
  118.                     }
  119.                     else if (!text1Words[i].equals(lcsList.get(k))) {
  120.                         for (; i < text1Words.length &&
  121.                             !text1Words[i].equals(lcsList.get(k)); i++) {
  122.                             stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  123.                                 deleteColor + "'>" + text1Words[i] + " </SPAN>");
  124.                         }
  125.                     } else if (!text2Words[j].equals(lcsList.get(k))) {
  126.                         for (; j < text2Words.length &&
  127.                             !text2Words[j].equals(lcsList.get(k)), j++) {
  128.                             stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  129.                                 insertColor + "'>" + text2Words[j] + " </SPAN>");
  130.                         }
  131.                     }
  132.                 }
  133.             }
  134.             for (; word1LastIndex < text1Words.length; word1LastIndex++) {
  135.                 stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  136.                     deleteColor + "'>" + text1Words[word1LastIndex] + " </SPAN>");
  137.             }
  138.             for (; word2LastIndex < text2Words.length; word2LastIndex++) {
  139.                 stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  140.                     insertColor + "'>" + text2Words[word2LastIndex] + " </SPAN>");
  141.             }
  142.         }
  143.         return stringBuffer.toString();
  144.     }
  145. }

Wednesday, April 4, 2012

A Regular Expression HashMap Implementation in Java





Below is an implementation of a Regular Expression HashMap. It works with key-value pairs which the key is a regular expression. It compiles the key (regular expression) while adding (i.e. putting), so there is no compile time while getting. Once getting an element, you don't give regular expression; you give any possible value of a regular expression. 

As a result, this behaviour provides to map numerous values of a regular expression into the same value. The class does not depend to any external libraries, uses only default java.util. So, it will be used simply when a behaviour like that is required. 

  1. import java.util.ArrayList;
  2. import java.util.HashMap;
  3. import java.util.regex.Pattern;
  4. /**
  5.  * This class is an extended version of Java HashMap
  6.  * and includes pattern-value lists which are used to
  7.  * evaluate regular expression values. If given item
  8.  * is a regular expression, it is saved in regexp lists.
  9.  * If requested item matches with a regular expression,
  10.  * its value is get from regexp lists.
  11.  *
  12.  * @author cb
  13.  *
  14.  * @param <K> : Key of the map item.
  15.  * @param <V> : Value of the map item.
  16.  */
  17. public class RegExHashMap<K, V> extends HashMap<K, V> {
  18.     // list of regular expression patterns
  19.     private ArrayList<Pattern> regExPatterns = new ArrayList<Pattern>();
  20.     // list of regular expression values which match patterns
  21.     private ArrayList<V> regExValues = new ArrayList<V>();
  22.     /**
  23.      * Compile regular expression and add it to the regexp list as key.
  24.      */
  25.     @Override
  26.     public V put(K key, V value) {
  27.        
  28.         regExPatterns.add(Pattern.compile(key.toString()));
  29.         regExValues.add(value);
  30.         return value;
  31.     }
  32.     /**
  33.      * If requested value matches with a regular expression,
  34.      * returns it from regexp lists.
  35.      */
  36.     @Override
  37.     public V get(Object key) {
  38.         CharSequence cs = new String(key.toString());
  39.        
  40.         for (int i = 0; i < regExPatterns.size(); i++) {
  41.             if (regExPatterns.get(i).matcher(cs).matches()) {
  42.                
  43.                 return regExValues.get(i);
  44.             }
  45.         }
  46.         return super.get(key);
  47.     }
  48. }