纵有疾风起
人生不言弃

Lucene笔记一

Lucene就是一个全文检索的工具,建立索引用的,类似于新华字典的目录

这里使用的是lucene-4.4.0版本,入门代码所需jar包如下图所示(解压lucene-4.4.0后的目录):

Lucene笔记一插图

入门代码:

import java.io.File;import java.io.IOException;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.IntField;import org.apache.lucene.document.StringField;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexableField;import org.apache.lucene.index.Term;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.junit.Test;/*8 * luceneDemo *  */public class TestLucene {    /**     * 通过lucene 提供的api 对数据建立索引,indexWriter     * @throws IOException      *      */    @Test    public void testAdd() throws IOException{                //索引在硬盘上面存放的位置..        Directory directory=FSDirectory.open(new File("D:/INDEX"));        //lucene 当前使用的版本...        Version matchVersion=Version.LUCENE_44;        //分词器...(把一段文本分词)(黑马程序员是高端的培训机构)        //analzyer 是一个抽象类,具体的切分词规则由子类实现...        Analyzer analyzer=new StandardAnalyzer(matchVersion);                IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);                //构造索引写入的对象..        IndexWriter indexWriter=new IndexWriter(directory, config);                //往索引库里面写数据..        //索引库里面的数据都是document 一个document相当于是一条记录        //这个document里面的数据相当于索引结构..        
Document document=new Document();        IndexableField indexableField=new IntField("id",1, Store.YES);        IndexableField stringfield=new StringField("title","对王召廷的个人评价",Store.YES);        IndexableField teIndexableField=new TextField("content","风流倜傥有点黄",Store.YES);        document.add(indexableField);        document.add(stringfield);        document.add(teIndexableField);        //索引库里面接收的数据都是document对象        indexWriter.addDocument(document);        indexWriter.close();    }        /**     * 对建立的索引进行搜索...     * 通过indexSearcher 去搜索...     * @throws IOException      */    @Test    public void testSearcher() throws IOException{                //索引在硬盘上面存放的位置..        Directory directory=FSDirectory.open(new File("D:/INDEX"));        //把索引目录里面的索引读取到IndexReader 当中...        IndexReader indexReader=DirectoryReader.open(directory);//        /构造搜索索引的对象..        IndexSearcher indexSearcher=new IndexSearcher(indexReader);                //Query 它是一个查询条件对象,它是一个抽象类,不同的查询规则就构造不同的子类...        Query query=new TermQuery(new Term("title", "对王召廷的个人评价"));                //检索符合query 条件的前面N 条记录..        //        TopDocs topDocs=indexSearcher.search(query, 10);        //返回总记录数...        System.out.println(topDocs.totalHits);                //存放的都是document 的id        ScoreDoc scoreDocs []=topDocs.scoreDocs;                for(ScoreDoc scoreDoc:scoreDocs){            //返回的就是document id            int docID=scoreDoc.doc;            //我还需要根据id 检索到对应的document            Document document=indexSearcher.doc(docID);                        System.out.println("id=="+document.get("id"));            System.out.println("title=="+document.get("title"));            System.out.println("content=="+document.get("content"));                    }     }    }

原理分析图:

Lucene笔记一插图(1)

demo演示: 

根据入门代码流程提炼工具类代码:

import java.io.File;import java.io.IOException;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;/** * lucene 工具类... * @author Administrator * *//** * 提炼规则,假设这段代码可以完成一个功能,把这个代码提炼到一个方法里面去,假设这个方法在某个业务罗继承可以共用,那么往上抽取, * 假设在其它逻辑层也可以用,提炼到工具类里面去。 *  */public class LuceneUtils {    private static IndexWriter indexWriter=null;    private static IndexSearcher indexSearcher=null;            //索引存放目录..    private static Directory directory=null;        private static IndexWriterConfig indexWriterConfig=null;        private static Version version=null;            private static Analyzer analyzer=null;        static {        try {            directory=FSDirectory.open(new File(Constants.URL));            version=Version.LUCENE_44;            analyzer=new StandardAnalyzer(version);            indexWriterConfig=new IndexWriterConfig(version, analyzer);        } catch (IOException e) {            e.printStackTrace();        }    }    /**     *      * @return 返回用于操作索引的对象...     * @throws IOException     */    public static IndexWriter getIndexWriter() throws IOException{        indexWriter=new IndexWriter(directory, indexWriterConfig);        return indexWriter;    }    /**     * 返回用于搜索索引的对象...     * @return     * @throws IOException      */    public static IndexSearcher  getIndexSearcher() throws IOException{                IndexReader indexReader=DirectoryReader.open(directory);        indexSearcher=new IndexSearcher(indexReader);                return indexSearcher;    }    /**     *      * 返回lucene 当前的版本...     
* @return     */    public static Version getVersion() {        return version;    }    /**     *      * 返回lucene 当前使用的分词器..     * @return     */    public static Analyzer getAnalyzer() {        return analyzer;    }    }
/**
 * Constants shared by the Lucene examples.
 */
public class Constants {

    /** Directory in which the index is stored. */
    public static final String URL = "d:/indexdir/news";
}

bean:

package cn.itcast.bean;public class Article {    private int id;        public int getId() {        return id;    }    public void setId(int id) {        this.id = id;    }    public String getTitle() {        return title;    }    public void setTitle(String title) {        this.title = title;    }    public String getContent() {        return content;    }    public void setContent(String content) {        this.content = content;    }    public String getAuthor() {        return author;    }    public void setAuthor(String author) {        this.author = author;    }    public String getUrl() {        return url;    }    public void setUrl(String url) {        this.url = url;    }    private String title;        private String content;        private String author;        private String url;    }

转换工具类:

package cn.itcast.lucene;import org.apache.lucene.document.Document;import org.apache.lucene.document.IntField;import org.apache.lucene.document.StringField;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.IndexableField;import cn.itcast.bean.Article;/*8 * 对象与索引库document 之间的转化 *  */public class ArticleToDocument {            public static Document articleToDocument(Article article){        Document document=new Document();        IntField idfield=new IntField("id", article.getId(), Store.YES);        //StringField 对应的值不分词,textField 分词..        TextField titleField=new TextField("title", article.getTitle(),Store.YES);        TextField contentField=new TextField("content", article.getContent(),Store.YES);        //修改这个字段对应的权重值,默认这个值为1f//        contentField.setBoost(3f);        StringField authorField=new StringField("author", article.getAuthor(), Store.YES);        StringField urlField=new StringField("url", article.getUrl(), Store.YES);        document.add(idfield);        document.add(titleField);        document.add(contentField);        document.add(authorField);        document.add(urlField);        return document;    }}

Dao层:

package cn.itcast.dao;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import cn.itcast.bean.Article;
import cn.itcast.lucene.ArticleToDocument;
import cn.itcast.uitls.LuceneUtils;

/**
 * Operates on the index through the Lucene API: add, delete, update and
 * paged search.
 *
 * @author Administrator
 */
public class LuceneDao {

    /**
     * Adds one article to the index.
     *
     * @param article the article to index
     * @throws IOException if the index cannot be written
     */
    public void addIndex(Article article) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            Document doc = ArticleToDocument.articleToDocument(article);
            indexWriter.addDocument(doc);
        } finally {
            // Close even on failure so the index write lock is released.
            indexWriter.close();
        }
    }

    /**
     * Deletes every document containing the given term.
     *
     * @param fieldName  name of the field to match
     * @param fieldValue term text to match; it is used as-is, so it must
     *                   equal a term as it was indexed
     * @throws IOException if the index cannot be written
     */
    public void delIndex(String fieldName, String fieldValue) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            Term term = new Term(fieldName, fieldValue);
            indexWriter.deleteDocuments(term);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Updates the index, comparable to
     * {@code update table set ? where condition}: Lucene first deletes the
     * documents matching the term and then adds the new document.
     *
     * @param fieldName  name of the field used as the update condition
     * @param fieldValue term text used as the update condition
     * @param article    the new content
     * @throws IOException if the index cannot be written
     */
    public void updateIndex(String fieldName, String fieldValue, Article article) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            // 1: the term is the update condition; 2: the document is the new content.
            Term term = new Term(fieldName, fieldValue);
            Document doc = ArticleToDocument.articleToDocument(article);
            indexWriter.updateDocument(term, doc);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Searches the title and content fields for the keywords and prints one
     * page of results. Pages are addressed as (0,10), (10,10), (20,10), ...
     *
     * @param keywords    query text, parsed by a multi-field query parser
     * @param firstResult index of the first hit to print
     * @param maxResult   maximum number of hits to print
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public void findIndex(String keywords, int firstResult, int maxResult) throws Exception {
        IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher();
        try {
            // Comparable to: select * from table where title = ... or content = ...
            String[] fields = {"title", "content"};

            // Multi-field query parser; it lives in the separate queryparser
            // jar found in the queryparser folder of the Lucene distribution.
            QueryParser queryParser =
                    new MultiFieldQueryParser(LuceneUtils.getVersion(), fields, LuceneUtils.getAnalyzer());
            Query query = queryParser.parse(keywords);

            // Fetch enough top hits to cover the requested page.
            TopDocs topDocs = indexSearcher.search(query, firstResult + maxResult);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            // Fewer hits than requested may come back; stop at whichever is smaller.
            int endResult = Math.min(scoreDocs.length, firstResult + maxResult);

            for (int i = firstResult; i < endResult; i++) {
                // The doc id is maintained by Lucene itself.
                int docID = scoreDocs[i].doc;
                Document document = indexSearcher.doc(docID);
                System.out.println("id===" + document.get("id"));
                System.out.println("title===" + document.get("title"));
                System.out.println("content===" + document.get("content"));
                System.out.println("url===" + document.get("url"));
                System.out.println("author===" + document.get("author"));
            }
        } finally {
            // LuceneUtils opens a new reader for every searcher; release it here.
            indexSearcher.getIndexReader().close();
        }
    }
}

测试类:

package cn.itcast.junit;

import java.io.IOException;

import org.junit.Test;

import cn.itcast.bean.Article;
import cn.itcast.dao.LuceneDao;

/**
 * Tests for LuceneDao.
 *
 * @author Administrator
 */
public class LuceneDaoTest {

    private LuceneDao luceneDao = new LuceneDao();

    /** Indexes the articles whose ids lie in the range 28..28. */
    @Test
    public void testCreate() throws IOException {
        int firstId = 28;
        int lastId = 28;
        for (int id = firstId; id <= lastId; id++) {
            luceneDao.addIndex(buildArticle(id, "矫情我觉得这句话太矫情了矫情矫情矫情矫情矫情矫情"));
        }
    }

    /**
     * Searches for a keyword and prints the third page of ten hits.
     * Title and content are both TextFields, so they are tokenized by the
     * standard analyzer.
     */
    @Test
    public void testsearcher() throws Exception {
        luceneDao.findIndex("梦想", 20, 10);
    }

    /** Deletes the documents whose title contains the given term. */
    @Test
    public void testdelete() throws IOException {
        luceneDao.delIndex("title", "定");
    }

    /** Replaces the documents whose title contains the given term. */
    @Test
    public void testUpdate() throws IOException {
        Article replacement = buildArticle(9527, "我觉得这句话太矫情了");
        luceneDao.updateIndex("title", "定", replacement);
    }

    /** Creates the sample article used by the tests with the given id and content. */
    private Article buildArticle(int id, String content) {
        Article sample = new Article();
        sample.setId(id);
        sample.setTitle("一定要梦想,万一实现了勒");
        sample.setContent(content);
        sample.setUrl("http://www.tianmao.com");
        sample.setAuthor("马云");
        return sample;
    }
}

 分词器的流程图:

Lucene笔记一插图(2)

 关于分词器,网上可以找到很多种类的分词器配合Lucene使用,相关分词规则查看对应说明。

举例如下:

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);//中文单字切分、英文按空格切分成单词

Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);//二分法分词,中文相邻的两个字作为一个索引词条

Analyzer analyzer=new IKAnalyzer();//第三方的分词器,对中文支持较好,可以自定义分词单词与停用词

 

索引库优化

package cn.itcast.lucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import cn.itcast.uitls.Constants;

/**
 * Examples of ways to optimise an index.
 */
public class TestOptimise {

    /**
     * First approach: set a merge policy through {@link IndexWriterConfig}.
     *
     * @throws IOException if the index directory cannot be opened
     */
    public void testoptimise() throws IOException {
        Directory directory = FSDirectory.open(new File(Constants.URL));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);

            LogDocMergePolicy mergePolicy = new LogDocMergePolicy();

            // Merge factor trade-off:
            //  - smaller values (2 <= n <= 10): less memory used while
            //    indexing, faster searches, slower indexing;
            //  - larger values (n > 10): more memory used while indexing,
            //    slower searches, faster indexing.
            mergePolicy.setMergeFactor(10);
            config.setMergePolicy(mergePolicy);

            IndexWriter indexWriter = new IndexWriter(directory, config);
            // Close the writer so the index write lock is not left held.
            indexWriter.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Second approach: copy the on-disk index into memory and search the
     * in-memory copy.
     *
     * @throws IOException if the index cannot be opened or read
     */
    @Test
    public void testoptimise2() throws IOException {
        // The index currently lives on disk.
        Directory directory = FSDirectory.open(new File(Constants.URL));
        try {
            IOContext ioContext = new IOContext();
            // Read the contents of the disk directory into a RAM directory.
            Directory directory1 = new RAMDirectory(directory, ioContext);
            try {
                IndexReader indexReader = DirectoryReader.open(directory1);
                try {
                    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                    Query query = new TermQuery(new Term("title", "想"));
                    TopDocs topDocs = indexSearcher.search(query, 100);
                    System.out.println(topDocs.totalHits);
                } finally {
                    indexReader.close();
                }
            } finally {
                directory1.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Third approach (not implemented): larger index files slow searching
     * down, so reduce the index size, e.g. by excluding stop words.
     */
    public void testoptimise3() {
    }

    /**
     * Fourth approach (not implemented): store the index in separate
     * directories, grouping the data by category.
     */
    public void testoptimise4() {
    }
}

 

文章转载于:https://www.cnblogs.com/lm970585581/p/9410322.html

原著是一个有趣的人,若有侵权,请通知删除

未经允许不得转载:起风网 » Lucene笔记一

分享到: 生成海报
avatar

评论 抢沙发

评论前必须登录!

立即登录   注册

切换注册

登录

忘记密码 ?

切换登录

注册

我们将发送一封验证邮件至你的邮箱, 请正确填写以完成账号注册和激活