Lucene 关键字高亮显示
关键字高亮指的是将搜索命中的关键字按照给定的格式(例如红色粗体)在搜索结果中回显。
添加依赖(除高亮模块外,示例代码还用到了 lucene-core、lucene-queryparser 和 lucene-analyzers-smartcn,版本需保持一致)
<!--代码高亮-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>8.0.0</version>
</dependency>
创建索引
package com.et.lucene04;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Paths;
/**
 * Builds a small demo Lucene index containing three city documents, each
 * with an {@code id}, a {@code citys} name and a {@code descs} description.
 * The index is written to {@code D://lucene2}.
 *
 * @Author: ETJAVA
 * @CreateTime: 2024-04-06 23:21
 * @Description: Creates the index that the highlighting demo searches.
 * @Version: 1.0
 */
public class Test {

    /**
     * Indexes the sample documents and prints how many were buffered.
     *
     * @param args unused
     * @throws IOException if the index directory cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        Integer[] ids = {1, 2, 3};
        String[] citys = {"广州", "深圳", "青岛"};
        String[] descs = {
            "广州市,简称“穗”,别称羊城、花城、五羊城,是广东省辖地级市、广东省省会、副省级市、国家中心城市、超大城市 [292]、广州都市圈核心城市,国务院批复确定的中国重要的中心城市、国际商贸中心和综合交通枢纽 [1],世界一线城市 [294]。截至2023年10月,广州市下辖11个区,总面积7434.40平方千米,市人民政府驻越秀区府前路1号 [2] [86]。截至2023年末,广州市常住人口1882.70万人,城镇化率为86.76%。",
            "深圳是个年轻的城市",
            "青岛是个美丽的城市"
        };

        // try-with-resources closes the writer (then the analyzer, then the
        // directory) even when indexing fails, so the index write lock is
        // always released.
        // NOTE(review): the writer uses the default open mode, so running this
        // program repeatedly presumably appends the same documents again —
        // confirm whether IndexWriterConfig.OpenMode.CREATE is wanted here.
        try (FSDirectory dir = FSDirectory.open(Paths.get("D://lucene2"));
             // Chinese-aware analyzer used to tokenize the text fields.
             Analyzer analyzer = new SmartChineseAnalyzer();
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new TextField("id", ids[i] + "", Field.Store.YES));
                // StringField: the city name is indexed as a single, un-analyzed term.
                doc.add(new StringField("citys", citys[i], Field.Store.YES));
                doc.add(new TextField("descs", descs[i], Field.Store.YES));
                writer.addDocument(doc);
            }
            // Reports the documents buffered in RAM (not yet flushed) by this writer.
            System.out.println("共写了 " + writer.numRamDocs() + " 个文档的索引");
        }
    }
}
搜索索引并高亮显示关键字
package com.et.lucene04;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.FSDirectory;
import java.io.StringReader;
import java.nio.file.Paths;
/**
 * Searches the {@code descs} field of the index stored in {@code D://lucene2}
 * and prints each hit with the matched keywords wrapped in red, bold HTML tags.
 *
 * @Author: ETJAVA
 * @CreateTime: 2024-04-06 23:23
 * @Description: Keyword-highlighting demo using the Lucene highlighter module.
 * @Version: 1.0
 */
public class CNHighlighter {

    /**
     * Runs the query {@code 广州羊城} and prints the id, city and the best
     * highlighted fragment of the description for every hit.
     *
     * @param args unused
     * @throws Exception if the index cannot be read, the query cannot be
     *                   parsed, or highlighting fails
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources guarantees the analyzer, reader and directory are
        // released even if searching or highlighting throws.
        try (FSDirectory dir = FSDirectory.open(Paths.get("D://lucene2"));
             DirectoryReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer()) {

            IndexSearcher indexSearcher = new IndexSearcher(reader);

            // Parse the user query against the "descs" field with the same
            // analyzer family that was used at index time.
            QueryParser parser = new QueryParser("descs", smartChineseAnalyzer);
            Query query = parser.parse("广州羊城");

            // Fetch at most the 10 best-scoring hits.
            TopDocs topDocs = indexSearcher.search(query, 10);

            // Highlighting set-up: the scorer rates text fragments against the
            // query, the fragmenter splits text based on that scorer, and the
            // formatter wraps every matched term in the given HTML tags.
            QueryScorer scorer = new QueryScorer(query);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            SimpleHTMLFormatter formatter =
                    new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
            Highlighter highlighter = new Highlighter(formatter, scorer);
            highlighter.setTextFragmenter(fragmenter);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // scoreDoc.doc is Lucene's internal document number for the hit.
                Document document = indexSearcher.doc(scoreDoc.doc);
                System.out.println("id:" + document.get("id"));
                System.out.println("city:" + document.get("citys"));

                String desc = document.get("descs");
                if (desc != null) {
                    // Re-tokenize the stored text under the field name it was
                    // actually indexed with ("descs").
                    TokenStream tokenStream =
                            smartChineseAnalyzer.tokenStream("descs", new StringReader(desc));
                    String summary = highlighter.getBestFragment(tokenStream, desc);
                    // getBestFragment yields null when no query term is found in
                    // the text; fall back to the raw description instead of
                    // printing "null".
                    System.out.println(summary != null ? summary : desc);
                }
            }
        }
    }
}
