package com.smart.basic; public class TernarySearchTrie { public final class TSTNode { /** * 节点的值,词原文,词性,词频等 */ public String data = null; /** * 低节点 */ protected TSTNode loNode; /** * 相等节点 */ protected TSTNode eqNode; /** * 高节点 */ protected TSTNode hiNode; /** * 节点的字符 */ protected char splitchar; /** * 构造方法 * * @param splitchar * 该节点表示的字符 */ protected TSTNode(char splitchar) { this.splitchar = splitchar; } public String toString() { return "splitchar:" + splitchar; } } protected TSTNode rootNode; /** * 查询 * @param word 要查询的单词 * @return 未找到返回null,找到返回单词的结束节点 */ protected TSTNode getNode(String word) { if (null == word) { return null; } int len = word.length(); if (len == 0) return null; TSTNode currentNode = rootNode; // 匹配过程中的当前节点的位置 int charIndex = 0; // 表示当前要比较的字符在Key中的位置 char cmpChar = word.charAt(charIndex); int charComp; while (true) { if (currentNode == null) {// 没找到 return null; } charComp = cmpChar - currentNode.splitchar; if (charComp == 0) {//相等往下走 charIndex++; if (charIndex == len) {//找到了 return currentNode; } else { cmpChar = word.charAt(charIndex);//词往下走 } currentNode = currentNode.eqNode; } else if (charComp < 0) {//小于往左走 currentNode = currentNode.loNode; } else {//大于往右走 currentNode = currentNode.hiNode; } } } /** * 向词典添加单词 * @param word 单词 * @return 单词的结束节点 */ protected TSTNode addWord(String word) { if (null == word) { throw new NullPointerException("空指针异常"); } int charIndex = 0; if (null == rootNode) { rootNode = new TSTNode(word.charAt(0)); } TSTNode currentNode = rootNode; while (true) { int charComp = word.charAt(charIndex) - currentNode.splitchar; if (charComp == 0) { charIndex++; if (charIndex == word.length()) { return currentNode; } if (null == currentNode.eqNode) { currentNode.eqNode = new TSTNode(word.charAt(charIndex)); } currentNode = currentNode.eqNode; } else if (charComp < 0) { if (null == currentNode.loNode) { currentNode.loNode = new TSTNode(word.charAt(charIndex)); } currentNode = currentNode.loNode; } else { if (null == 
currentNode.hiNode) { currentNode.hiNode = new TSTNode(word.charAt(charIndex)); } currentNode = currentNode.hiNode; } } } }
package com.smart.basic;

import java.util.ArrayList;
import java.util.List;

/**
 * Forward maximum-length-match segmenter that uses the inherited ternary
 * search trie as its dictionary.
 *
 * <p>Runs of ASCII letters and runs of ASCII digits are emitted as single
 * tokens without consulting the dictionary.
 */
public class TSTMaxMatch extends TernarySearchTrie {

    /** Returns whether {@code c} is an ASCII letter (a-z or A-Z). */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Returns whether {@code c} is an ASCII digit (0-9). */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Scans forward over a run of ASCII letters.
     *
     * @param start index to start scanning at
     * @param sentence text being scanned
     * @return index just past the run; equals {@code start} when there is no run
     */
    private int matchEnglish(int start, String sentence) {
        int i = start;
        while (i < sentence.length() && isAsciiLetter(sentence.charAt(i))) {
            i++;
        }
        return i;
    }

    /**
     * Scans forward over a run of ASCII digits.
     *
     * @param start index to start scanning at
     * @param sentence text being scanned
     * @return index just past the run; equals {@code start} when there is no run
     */
    private int matchNum(int start, String sentence) {
        int i = start;
        while (i < sentence.length() && isAsciiDigit(sentence.charAt(i))) {
            i++;
        }
        return i;
    }

    /**
     * Splits a sentence into tokens, left to right.
     *
     * <p>At each position the result of {@link #maxMatch(String, int)} is
     * emitted and the position advances by that token's length; when nothing
     * matches, the single character at the position is emitted instead.
     * Advancing by the token length assumes each node's {@code data} has the
     * same length as the dictionary word it was stored under.
     *
     * @param sentence text to segment; {@code null} yields an empty list
     * @return the tokens in sentence order
     */
    public List<String> tag(String sentence) {
        List<String> words = new ArrayList<String>();
        if (sentence == null) {
            return words;
        }
        int i = 0;
        while (i < sentence.length()) {
            String w = maxMatch(sentence, i);
            if (w.isEmpty()) {
                // No dictionary, letter or digit match: emit one character.
                words.add(sentence.substring(i, i + 1));
                i++;
            } else {
                words.add(w);
                i += w.length();
            }
        }
        return words;
    }

    /**
     * Forward maximum-length match starting at {@code offset}.
     *
     * <p>A run of ASCII letters or of ASCII digits at {@code offset} is
     * returned as-is and does not depend on the dictionary. Otherwise the trie
     * is walked and the {@code data} of the deepest matching node that has
     * non-null {@code data} is returned.
     *
     * @param sentence text to match against
     * @param offset index in {@code sentence} at which matching starts
     * @return the matched token, or {@code ""} when nothing matches, when
     *         {@code sentence} is {@code null} or empty, or when
     *         {@code offset} lies outside the sentence
     */
    public String maxMatch(String sentence, int offset) {
        if (sentence == null || sentence.isEmpty()
                || offset < 0 || offset >= sentence.length()) {
            return "";
        }

        int endIndex = matchEnglish(offset, sentence);
        if (endIndex != offset) {
            return sentence.substring(offset, endIndex);
        }
        endIndex = matchNum(offset, sentence);
        if (endIndex != offset) {
            return sentence.substring(offset, endIndex);
        }

        String ret = "";
        int charIndex = offset;
        // A null root (empty dictionary) simply skips the loop.
        TSTNode currentNode = rootNode;
        while (currentNode != null) {
            int charComp = sentence.charAt(charIndex) - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (currentNode.data != null) {
                    ret = currentNode.data; // longest dictionary word so far
                }
                if (charIndex == sentence.length()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /** Small demo: segments a sample sentence and prints one token per line. */
    public static void main(String[] args) {
        TSTMaxMatch tree = new TSTMaxMatch();
        tree.addWord("大学生").data = "大学生";
        tree.addWord("大学").data = "大学";
        tree.addWord("活动中心").data = "活动中心";
        List<String> ret = tree.tag("大学生活动中心");
        for (String word : ret) {
            System.out.println(word);
        }
    }
}
package com.smart.basic; import java.util.ArrayList; import java.util.List; public class TSTBackMaxMatch extends TernarySearchTrie { /** * 匹配英文 * * @param sen * @param offset * @return */ private String matchEnglish(char[] sen, int offset) { int i = offset; for (; i >= 0;) { char ch = sen[i]; if (ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch == '\'') i--; else break; } String eng = subCharArray(sen, i + 1, offset + 1); return eng; } /** * 匹配数字 * * @param sen * @param offset * @return */ private String matchNum(char[] sen, int offset) { int i = offset; for (; i >= 0;) { char ch = sen[i]; if (ch >= '0' && ch <= '9') i--; else break; } String num = subCharArray(sen, i + 1, offset + 1); return num; } /** * 截取子串 * * @param sen * @param start * @param end * @return */ private String subCharArray(char[] sen, int start, int end) { char[] chs = new char[end - start]; if (start != end) { System.arraycopy(sen, start, chs, 0, end - start); } return String.valueOf(chs); } /** * 逆向最大长度匹配 * * @param sentence * @param offset * @return */ private String matchLongBackward(char[] sentence, int offset) { String ret = null; if (rootNode == null || sentence == null || sentence.length == 0) { return ret; } String eng = matchEnglish(sentence, offset); if (!"".equals(eng)) { return eng; } String num = matchNum(sentence, offset); if (!"".equals(num)) { return num; } int charIndex = offset; TSTNode currentNode = rootNode; while (true) { if (currentNode == null) { if (ret == null) { String singleCn = subCharArray(sentence, offset, offset + 1); return singleCn; } return ret; } int charComp = sentence[charIndex] - currentNode.splitchar; if (charComp == 0) { charIndex--; if (currentNode.data != null) { ret = currentNode.data; } if (charIndex < 0) { if (ret == null) { String singleCn = subCharArray(sentence, offset, offset + 1); return singleCn; } return ret; } currentNode = currentNode.eqNode; } else if (charComp < 0) { currentNode = currentNode.loNode; } else { currentNode = 
currentNode.hiNode; } } } /** * 切分 * * @param sentence * @return */ public List<String> tag(String sentence) { ArrayList<String> list = new ArrayList<String>(); char[] sen = sentence.toCharArray(); int offset = sentence.length() - 1; while (offset >= 0) { String word = matchLongBackward(sen, offset); if (word == null) { offset--; } else { offset -= word.length(); } if (word != null) {// 过滤掉空白字符 list.add(word); } } return list; } public static void main(String[] args) { TSTBackMaxMatch tree=new TSTBackMaxMatch(); tree.addWord("心中").data="中心"; tree.addWord("动活").data="活动"; tree.addWord("生学大").data="大学生"; tree.addWord("学大").data="大学"; List<String> words=tree.tag("大学生123活动abc中心"); for (String string : words) { System.out.println(string); } } }