正文
全文检索解决方案(lucene工具类以及sphinx相关资料)
小程序:扫一扫查出行
【扫一扫了解最新限行尾号】
复制小程序
【扫一扫了解最新限行尾号】
复制小程序
介绍两种全文检索的技术。
1、 lucene+ 中文分词(IK)
关于lucene的原理,在这里可以得到很好的学习。
http://www.blogjava.net/zhyiwww/archive/2006/07/07/57122.html
本帖主要贴几个关于lucene的工具类。
- 索引建立
package com.lpm.fanger.search.base;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.beanutils.PropertyUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKSimilarity;

/**
 * Lucene index utility class.
 *
 * <p>Keeps one index directory per {@link LuceneEnable} class (named after the class's
 * simple name, under {@code Globle_Lucene_Path}) and offers add / update / delete and
 * search helpers on top of it. Every document carries a keyword field {@code "id"} that
 * holds the object's primary key as a 12-digit, zero-padded string.
 *
 * @author Lee
 * @Date 2013-8-22
 */
public class IndexUtils {

    /** Root directory that contains one sub-directory per indexed class. */
    private final static String Globle_Lucene_Path = "D:/lucene_index";

    /** Name of the keyword field that stores the primary key. */
    private final static String KeyWord_Field_Name = "id";

    /** Analyzer shared by all index writers and query parsers of this class. */
    private final static IKAnalyzer Globle_Analyzer = new IKAnalyzer();

    /** Pattern used to serialise {@link Date} properties into the index. */
    private final static String FMT_DATE = "yyyyMMddHHmmssSSS";

    /** Formats primary keys as exactly 12 digits without grouping separators. */
    private final static NumberFormat FMT_ID = NumberFormat.getInstance();

    static {
        FMT_ID.setGroupingUsed(false);
        FMT_ID.setMaximumFractionDigits(0);
        FMT_ID.setMaximumIntegerDigits(12);
        FMT_ID.setMinimumIntegerDigits(12);
    }

    private IndexUtils() {
    }

    /**
     * Returns the analyzer used for indexing.
     *
     * @return the shared analyzer instance
     */
    public final static Analyzer getAnalyzer() {
        return Globle_Analyzer;
    }

    /* ------------------------------ CRUD ------------------------------ */

    /**
     * Adds a batch of objects to the index of {@code clazz} and optimizes the index.
     *
     * @param clazz class whose index directory is written to
     * @param objs  objects to index; {@code null} or empty is a no-op
     * @return number of documents added
     * @throws Exception if the index cannot be opened or written, or a property cannot be read
     */
    public static int add(Class<? extends LuceneEnable> clazz, List<? extends LuceneEnable> objs)
            throws Exception {
        if (objs == null || objs.size() == 0) {
            return 0;
        }
        IndexWriter writer = getWriter(clazz);
        try {
            int count = add(writer, objs);
            writer.optimize();
            return count;
        } finally {
            writer.close();
        }
    }

    /**
     * Adds a single object to the index of its own class.
     *
     * @param doc object to index; {@code null} is a no-op
     * @throws Exception if the index cannot be opened or written, or a property cannot be read
     */
    public static void add(LuceneEnable doc) throws Exception {
        if (doc == null) {
            return;
        }
        IndexWriter writer = getWriter(doc.getClass());
        try {
            writer.addDocument(objectToDocument(doc));
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Removes the document(s) whose primary-key field matches {@code doc}.
     *
     * @param doc object whose document should be removed; {@code null} is a no-op
     * @throws Exception if the index cannot be opened or written
     */
    public static void delete(LuceneEnable doc) throws Exception {
        if (doc == null) {
            return;
        }
        IndexWriter writer = getWriter(doc.getClass());
        try {
            writer.deleteDocuments(primaryKeyTerm(doc));
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Replaces the indexed document of {@code doc} with a freshly built one.
     *
     * @param doc object to re-index; {@code null} is a no-op
     * @throws Exception if the index cannot be opened or written, or a property cannot be read
     */
    public static void update(LuceneEnable doc) throws Exception {
        if (doc == null) {
            return;
        }
        IndexWriter writer = getWriter(doc.getClass());
        try {
            // updateDocument deletes by term and adds the new document in one step.
            writer.updateDocument(primaryKeyTerm(doc), objectToDocument(doc));
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /* ----------------------------- Search ----------------------------- */

    /**
     * Searches the index of {@code clazz} and returns the primary keys of the hits.
     *
     * @param clazz    class whose index is searched
     * @param query    query to run
     * @param maxCount maximum number of hits to fetch
     * @return distinct positive primary keys in hit order; never {@code null}
     * @throws Exception if the index cannot be opened or read
     */
    public static List<Long> find(Class<? extends LuceneEnable> clazz, Query query, int maxCount)
            throws Exception {
        IndexSearcher searcher = getReader(clazz);
        try {
            TopDocs hits = searcher.search(query, null, maxCount);
            // A linked set keeps hit order while dropping duplicate keys.
            Set<Long> keys = new LinkedHashSet<Long>();
            if (hits != null) {
                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = searcher.doc(scoreDoc.doc);
                    long primaryKey = NumberUtils.toLong(doc.get(KeyWord_Field_Name));
                    if (primaryKey > 0) {
                        keys.add(Long.valueOf(primaryKey));
                    }
                }
            }
            return new ArrayList<Long>(keys);
        } finally {
            searcher.close();
        }
    }

    /**
     * Searches the index of {@code clazz} and rebuilds an object from every hit.
     *
     * <p>Only values that were stored in the index are restored; see
     * {@link #documentToObject(Class, Document)}.
     *
     * @param clazz    class whose index is searched; must have a public no-arg constructor
     * @param query    query to run
     * @param maxCount maximum number of hits to fetch
     * @return rebuilt objects in hit order; never {@code null}
     * @throws Exception if the index cannot be read or an object cannot be populated
     */
    public static List<? extends LuceneEnable> findList(Class<? extends LuceneEnable> clazz,
            Query query, int maxCount) throws Exception {
        IndexSearcher searcher = getReader(clazz);
        try {
            List<LuceneEnable> results = new ArrayList<LuceneEnable>();
            TopDocs hits = searcher.search(query, null, maxCount);
            if (hits != null) {
                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    results.add(documentToObject(clazz, searcher.doc(scoreDoc.doc)));
                }
            }
            return results;
        } finally {
            searcher.close();
        }
    }

    /**
     * Combines the given clauses into one boolean query.
     *
     * @param booleanClauses clauses to combine
     * @return a query containing every clause
     */
    public static BooleanQuery getFullTextQuery(BooleanClause... booleanClauses) {
        BooleanQuery booleanQuery = new BooleanQuery();
        for (BooleanClause booleanClause : booleanClauses) {
            booleanQuery.add(booleanClause);
        }
        return booleanQuery;
    }

    /**
     * Builds a query that matches {@code q} in any of the given fields.
     *
     * <p>Parsing is best-effort: when {@code q} cannot be parsed the stack trace is printed
     * and the clauses collected so far (possibly none) are returned.
     *
     * @param q      search keywords; {@code null} or empty yields an empty query
     * @param fields names of the fields to search
     * @return the combined query, never {@code null}
     */
    public static BooleanQuery getFullTextQuery(String q, String... fields) {
        BooleanQuery query = new BooleanQuery();
        if (q == null || q.equals("") || fields == null) {
            return query;
        }
        try {
            for (String field : fields) {
                QueryParser parser = new QueryParser(Version.LUCENE_36, field, Globle_Analyzer);
                query.add(parser.parse(q), Occur.SHOULD);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return query;
    }

    /* ----------------------------- Helpers ---------------------------- */

    /**
     * Adds every object to an already opened writer, applying each object's boost.
     *
     * @param writer open index writer; not closed by this method
     * @param objs   objects to index; {@code null} or empty is a no-op
     * @return number of documents added
     * @throws Exception if a document cannot be built or written
     */
    protected static int add(IndexWriter writer, List<? extends LuceneEnable> objs) throws Exception {
        if (objs == null || objs.size() == 0) {
            return 0;
        }
        int count = 0;
        for (LuceneEnable obj : objs) {
            Document doc = objectToDocument(obj);
            doc.setBoost(obj.GetBoost());
            writer.addDocument(doc);
            count++;
        }
        return count;
    }

    /**
     * Opens an index writer on the directory that belongs to {@code clazz}.
     *
     * @param clazz class whose simple name selects the index directory
     * @return a new writer; the caller must close it
     * @throws IOException if the directory cannot be opened
     */
    protected static IndexWriter getWriter(Class<?> clazz) throws IOException {
        Directory indexDir = FSDirectory.open(new File(indexPath(clazz)));
        return new IndexWriter(indexDir, Globle_Analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Opens a searcher on the directory that belongs to {@code clazz}.
     *
     * @param clazz class whose simple name selects the index directory
     * @return a new searcher using the IK similarity; the caller must close it
     * @throws IOException if the directory cannot be opened
     */
    protected static IndexSearcher getReader(Class<?> clazz) throws IOException {
        Directory indexDir = FSDirectory.open(new File(indexPath(clazz)));
        IndexSearcher searcher = new IndexSearcher(indexDir);
        Similarity similarity = new IKSimilarity();
        searcher.setSimilarity(similarity);
        return searcher;
    }

    /** Returns the index directory path used for {@code clazz}. */
    private static String indexPath(Class<?> clazz) {
        return Globle_Lucene_Path + File.separator + clazz.getSimpleName();
    }

    /**
     * Formats a primary key exactly as it is written into the index.
     *
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    private static String formatKey(Long key) {
        if (key == null) {
            throw new IllegalArgumentException("Cannot format a null primary key");
        }
        // NumberFormat instances are not synchronized, and FMT_ID is shared statically.
        synchronized (FMT_ID) {
            return FMT_ID.format(key.longValue());
        }
    }

    /**
     * Builds the term that identifies the indexed document of {@code doc}. It must use the
     * same formatted key as {@link #objectToDocument(LuceneEnable)}, otherwise deletes and
     * updates would never match the stored value.
     */
    private static Term primaryKeyTerm(LuceneEnable doc) {
        return new Term(KeyWord_Field_Name, formatKey(doc.getPrimeryKey()));
    }

    /**
     * Rebuilds an object from a stored document.
     *
     * <p>For every field declared on {@code clazz} that has a non-empty stored value of the
     * same name, the value is assigned through the bean setter.
     *
     * @param clazz class to instantiate through its no-arg constructor
     * @param doc   stored document
     * @return the populated instance
     * @throws Exception if the class cannot be instantiated or a value cannot be assigned
     */
    private static LuceneEnable documentToObject(Class<? extends LuceneEnable> clazz, Document doc)
            throws Exception {
        LuceneEnable obj = clazz.newInstance();
        for (java.lang.reflect.Field field : clazz.getDeclaredFields()) {
            String name = field.getName();
            String value = doc.get(name);
            if (value == null || value.equals("")) {
                continue;
            }
            setFieldValue(obj, name, value);
        }
        return obj;
    }

    /**
     * Converts an object into a Lucene document: the primary key, the index fields, the
     * store-only fields and both extension maps.
     *
     * @param obj object to convert
     * @return the new document
     * @throws Exception if a property cannot be read
     */
    private static Document objectToDocument(LuceneEnable obj) throws Exception {
        Document doc = new Document();
        doc.add(keyWord(KeyWord_Field_Name, formatKey(obj.getPrimeryKey())));

        // Analyzed (searchable) fields.
        String[] indexFields = obj.GetIndexFields();
        if (indexFields != null) {
            for (String indexField : indexFields) {
                String value = getFieldValue(obj, indexField);
                if (value != null && !value.equals("")) {
                    doc.add(index(indexField, value));
                }
            }
        }

        // Stored, non-analyzed fields.
        String[] storeFields = obj.GetStoreFields();
        if (storeFields != null) {
            for (String storeField : storeFields) {
                String value = getFieldValue(obj, storeField);
                if (value != null && !value.equals("")) {
                    doc.add(keyWord(storeField, value));
                }
            }
        }

        // Extension values; null values are skipped because Field rejects them.
        HashMap<String, String> extendIndex = obj.GetExtendIndexValues();
        if (extendIndex != null) {
            for (Map.Entry<String, String> entry : extendIndex.entrySet()) {
                if (entry.getValue() != null) {
                    doc.add(index(entry.getKey(), entry.getValue()));
                }
            }
        }
        HashMap<String, String> extend = obj.GetExtendValues();
        if (extend != null) {
            for (Map.Entry<String, String> entry : extend.entrySet()) {
                if (entry.getValue() != null) {
                    doc.add(keyWord(entry.getKey(), entry.getValue()));
                }
            }
        }
        return doc;
    }

    /**
     * Creates a stored field that is indexed as a single, non-analyzed token.
     *
     * @param name  field name
     * @param value field value
     * @return the keyword field
     */
    private static final Field keyWord(String name, String value) {
        return new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
    }

    /**
     * Creates a stored field whose value is run through the analyzer.
     *
     * @param name  field name
     * @param value field value
     * @return the analyzed field
     */
    private static final Field index(String name, String value) {
        return new Field(name, value, Field.Store.YES, Field.Index.ANALYZED);
    }

    /**
     * Reads a bean property as a string.
     *
     * @param obj       bean to read from
     * @param fieldName property name
     * @return the string itself, a date formatted with {@code FMT_DATE}, the
     *         {@code String.valueOf} form of any other value, or {@code null} when the
     *         property is {@code null} (so callers skip it instead of indexing "null")
     * @throws Exception if the property cannot be read
     */
    private static String getFieldValue(Object obj, String fieldName) throws Exception {
        Object fieldValue = PropertyUtils.getProperty(obj, fieldName);
        if (fieldValue == null) {
            return null;
        }
        if (fieldValue instanceof String) {
            return (String) fieldValue;
        }
        if (fieldValue instanceof Date) {
            return DateFormatUtils.format((Date) fieldValue, FMT_DATE);
        }
        return String.valueOf(fieldValue);
    }

    /**
     * Assigns a stored string value to a bean property.
     *
     * <p>{@code BeanUtils} converts the string to the property's type, which plain
     * {@code PropertyUtils.setProperty} does not do.
     * NOTE(review): date properties are stored in {@code FMT_DATE} form and probably need a
     * registered converter to be restored - confirm against the BeanUtils version in use.
     *
     * @param obj        bean to modify
     * @param fieldName  property name
     * @param fieldValue stored value
     * @throws Exception if the value cannot be converted or assigned
     */
    private static void setFieldValue(Object obj, String fieldName, String fieldValue)
            throws Exception {
        BeanUtils.setProperty(obj, fieldName, fieldValue);
    }
}
- 查询
package com.lpm.fanger.search.base;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.queryParser.ParseException;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.BooleanClause;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.BooleanClause.Occur;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;/** * @Intro Lucene搜索工具类 * @author Lee * @Date 2013-8-24 */public class LuceneSearchUtils {/** * 获取全文查询对象 */public static BooleanQuery getFullTextQuery(BooleanClause... booleanClauses){BooleanQuery booleanQuery = new BooleanQuery();for (BooleanClause booleanClause : booleanClauses){booleanQuery.add(booleanClause);}return booleanQuery;}/** * 获取全文查询对象 * @param q 查询关键字 * @param fields 查询字段 * @return 全文查询对象 */public static BooleanQuery getFullTextQuery(String q, String... fields){Analyzer analyzer = new IKAnalyzer();BooleanQuery query = new BooleanQuery();try {if (q != null && !q.equals("")){for (String field : fields){QueryParser parser = new QueryParser(Version.LUCENE_36, field, analyzer);query.add(parser.parse(q), Occur.SHOULD);}}} catch (ParseException e) {e.printStackTrace();}return query;}}
- 使用
一般在项目中单独开一个端口,不断地更新索引。
/**
 * Builds the index for one class, batch by batch.
 *
 * <p>Each round asks {@code dao.listAfter} for up to {@code BATCH_COUNT} rows, passing the
 * primary key of the last row of the previous round (0 on the first round), and feeds the
 * rows to {@code IndexUtils.add}. The loop ends when the DAO returns {@code null} or fewer
 * than {@code BATCH_COUNT} rows.
 *
 * @param objClass class whose index is written
 * @return total number of documents added
 * @throws Exception if loading or indexing a batch fails
 */
private static int _BuildIndexOfObject(Class<? extends LuceneEnable> objClass) throws Exception {
    int indexed = 0;
    long cursor = 0L;
    for (;;) {
        List<? extends LuceneEnable> batch = dao.listAfter(cursor, BATCH_COUNT);
        if (batch == null) {
            break;
        }
        if (!batch.isEmpty()) {
            indexed += IndexUtils.add(objClass, batch);
            cursor = batch.get(batch.size() - 1).getPrimeryKey();
        }
        if (batch.size() < BATCH_COUNT) {
            break;
        }
    }
    return indexed;
}
- 测试
private final static Log log = LogFactory.getLog(RebuildLuceneIndex.class);private final static int BATCH_COUNT = 500;//static BookDao dao;//static AticleDao dao;static ExampleDao dao;static{ApplicationContext app = new ClassPathXmlApplicationContext("spring.xml");//dao = app.getBean("bookDao", BookDao.class);//dao = app.getBean("aticleDao", AticleDao.class);dao = app.getBean("exampleDao", ExampleDao.class);}@SuppressWarnings({ "rawtypes", "unchecked" })public static void main(String[] args) throws Exception {String beanName = Example.class.getName();//Book.class.getName();//Aticle.class.getName();//Class beanClass = Class.forName(beanName);//Long t1 = System.currentTimeMillis();//int ic = _BuildIndexOfObject(beanClass);//log.info(ic + " documents of " + beanName + " created.");//System.out.println("TIME:"+(System.currentTimeMillis() - t1)+"ms");Long t2 = System.currentTimeMillis();Query query =// LuceneSearchUtils.getFullTextQuery("神奇校车", new String[]{"bookName"});//,"outline"}IKQueryParser.parseMultiField(new String[]{"title"}, "选择");//经过测试,这个方法比较好一点//LuceneSearchUtils.getFullTextQuery("java", new String[]{"book_name","out_line"});//IKQueryParser.parseMultiField(new String[]{"title","content"}, "c++");List<Long> list = IndexUtils.find(beanClass, query, 100);//LuceneIndexUtils.find(beanClass, query, 100);//List<Aticle> list = (List<Aticle>) LuceneIndexUtils.find(beanClass, query, 100, false);//List<Book> list = (List<Book>) LuceneIndexUtils.find(beanClass, query, 100, false);System.out.println(list.size());System.out.println("TIME:"+(System.currentTimeMillis() - t2)+"ms");System.exit(0);}
- 相关的bean
package com.lpm.fanger.search.base;

import java.util.HashMap;

import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;

/**
 * Sample searchable bean mapped to table {@code t_article}.
 *
 * <p>{@code title} and {@code content} are indexed for full-text search; {@code tag} is
 * stored only.
 *
 * @author Lee
 * @Date 2013-8-24
 */
@Table(name="t_article")
public class Example implements LuceneEnable {

    private Integer id;
    private String title;
    private String content;
    private String tag;

    /* ------------------------ getters and setters ------------------------ */

    @Id
    @GeneratedValue(strategy=GenerationType.IDENTITY)
    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    /* ------------------------ LuceneEnable methods ----------------------- */

    /**
     * Returns the id widened to {@code Long}.
     *
     * @throws NullPointerException if the id has not been set
     */
    @Override
    public Long getPrimeryKey() {
        return Long.valueOf(this.getId().longValue());
    }

    /** Returns the properties that are stored but not analyzed. */
    @Override
    public String[] GetStoreFields() {
        return new String[]{"tag"};
    }

    /** Returns the properties that are analyzed for full-text search. */
    @Override
    public String[] GetIndexFields() {
        return new String[]{"title","content"};
    }

    /** This bean has no extension values. */
    @Override
    public HashMap<String, String> GetExtendValues() {
        return null;
    }

    /** This bean has no extension index values. */
    @Override
    public HashMap<String, String> GetExtendIndexValues() {
        return null;
    }

    /**
     * Returns the document boost.
     *
     * <p>1.0 is Lucene's neutral boost. The value is multiplied into the score of every hit
     * on the document, so returning 0 would zero all scores and destroy the ranking.
     */
    @Override
    public float GetBoost() {
        return 1.0f;
    }
}
- 相关的接口(重要)
package com.lpm.fanger.search.base;

import java.util.HashMap;
import java.util.List;

/**
 * Contract for beans that take part in Lucene full-text search. A bean class must
 * implement this interface to be indexed and searched.
 *
 * @author Lee
 * @Date 2013-8-24
 */
public interface LuceneEnable {

    /**
     * Returns the primary key of the object. Search results carry this key, so the
     * complete record can then be loaded from the database table.
     *
     * @return the primary key value
     */
    Long getPrimeryKey();

    /**
     * Returns the names of the properties that should be stored, e.g. createTime, author.
     *
     * @return property names to store
     */
    String[] GetStoreFields();

    /**
     * Returns the names of the properties that should be indexed, e.g. title, content.
     *
     * @return property names to index
     */
    String[] GetIndexFields();

    /**
     * Returns the extension values of the object.
     *
     * @return extension values keyed by field name
     */
    HashMap<String, String> GetExtendValues();

    /**
     * Returns the extension index values of the object.
     *
     * @return extension index values keyed by field name
     */
    HashMap<String, String> GetExtendIndexValues();

    /**
     * Returns the boost (weight) of the document.
     *
     * @return the boost factor
     */
    float GetBoost();
}
- 相关的dao
package com.lpm.fanger.jdbc.dao;

import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.springframework.jdbc.core.RowMapper;
import org.springframework.stereotype.Repository;

import com.lpm.fanger.jdbc.mysql.BaseDaoMysqlImpl;
import com.lpm.fanger.search.base.Example;

/**
 * Database access for {@link Example} rows.
 *
 * @author Lee
 * @Date 2013-8-26
 */
@Repository("exampleDao")
public class ExampleDao extends BaseDaoMysqlImpl<Example, Integer> {

    public ExampleDao() {
        super(Example.class);
    }

    /**
     * Returns up to {@code count} rows whose {@code id} is greater than {@code begain},
     * in ascending {@code id} order.
     *
     * <p>Callers page through the table by passing the id of the last row they received.
     * Treating that value as a {@code LIMIT} offset would skip or repeat rows whenever the
     * ids are not contiguous from 1, so the lower bound is applied to the {@code id}
     * column and the result is explicitly ordered.
     *
     * @param begain exclusive lower bound for {@code id}; pass 0 to start from the beginning
     * @param count  maximum number of rows to return
     * @return the matching rows mapped to {@link Example} objects
     */
    public List<Example> listAfter(Long begain, Integer count) {
        List<Object> values = new ArrayList<Object>();
        values.add(begain);
        values.add(count);
        String sql = "select * from " + getTableName() + " where id > ? order by id limit ?";
        return search(sql, values, new ExampleRowMappere());
    }
}

/** Maps one result-set row to an {@link Example}. */
class ExampleRowMappere implements RowMapper<Example> {

    @Override
    public Example mapRow(ResultSet rs, int value) throws SQLException {
        Example ex = new Example();
        ex.setContent(rs.getString("content"));
        ex.setTitle(rs.getString("title"));
        ex.setTag(rs.getString("tag"));
        ex.setId(rs.getInt("id"));
        return ex;
    }
}
2、 mysql + sphinx
这种技术架构有很好的性能,主要的工作交给了插件 sphinx。
相关资料:包括原理,实例以及安装,查询语句的书写等等。
http://pan.baidu.com/share/link?shareid=152940799&uk=572544164
感谢书写这些文档的前辈以及大牛们。如有侵权,请您给我留言,我会把这个链接拿掉。