From 5c57326178a5837c92d3d545a012e13168ef018c Mon Sep 17 00:00:00 2001
From: Kelly Kelly <kelly@basistech.com>
Date: Tue, 25 Oct 2022 12:58:12 -0400
Subject: [PATCH] stashing

---
 KeywordSearch/ivy.xml                              |  3 +-
 KeywordSearch/nbproject/project.properties         |  1 +
 KeywordSearch/nbproject/project.xml                |  4 ++
 .../autopsy/keywordsearch/InlineSearcher.java      | 51 +++++++++++++++----
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/KeywordSearch/ivy.xml b/KeywordSearch/ivy.xml
index 7b417a99c7..df0be3b330 100644
--- a/KeywordSearch/ivy.xml
+++ b/KeywordSearch/ivy.xml
@@ -18,7 +18,8 @@
     <dependency conf="solr-war->default" org="org.apache.solr" name="solr" rev="4.10.4" transitive="false" /> <!-- the war file for embedded Solr 4 -->
 
     <dependency conf="solr-libs->default" name="solr-cell" rev="8.11.2" org="org.apache.solr"/>
-
+    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
+    <!-- <dependency org="org.apache.lucene" name="lucene-core" rev="8.11.2"/> -->
     <!-- Autopsy -->
     <dependency conf="autopsy->default" org="org.apache.solr" name="solr-solrj" rev="8.11.2"/>
     <dependency conf="autopsy->default" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
diff --git a/KeywordSearch/nbproject/project.properties b/KeywordSearch/nbproject/project.properties
index d639073ca7..4046543d12 100644
--- a/KeywordSearch/nbproject/project.properties
+++ b/KeywordSearch/nbproject/project.properties
@@ -44,6 +44,7 @@ file.reference.stax2-api-4.2.1.jar=release/modules/ext/stax2-api-4.2.1.jar
 file.reference.woodstox-core-6.2.4.jar=release/modules/ext/woodstox-core-6.2.4.jar
 file.reference.zookeeper-3.8.0.jar=release/modules/ext/zookeeper-3.8.0.jar
 file.reference.zookeeper-jute-3.8.0.jar=release/modules/ext/zookeeper-jute-3.8.0.jar
+file.reference.lucene-core-8.11.2.jar=release/modules/ext/lucene-core-8.11.2.jar
 javac.source=1.8
 javac.compilerargs=-Xlint -Xlint:-serial
 license.file=../LICENSE-2.0.txt
diff --git a/KeywordSearch/nbproject/project.xml b/KeywordSearch/nbproject/project.xml
index 9b8fa50bda..c5777d8a14 100644
--- a/KeywordSearch/nbproject/project.xml
+++ b/KeywordSearch/nbproject/project.xml
@@ -418,6 +418,10 @@
                 <runtime-relative-path>ext/zookeeper-jute-3.8.0.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/zookeeper-jute-3.8.0.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/lucene-core-8.11.2.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/lucene-core-8.11.2.jar</binary-origin>
+            </class-path-extension>
         </data>
     </configuration>
 </project>
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java
index 9cfef44812..6dbd6c83fe 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java
@@ -19,6 +19,7 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import com.twelvemonkeys.lang.StringUtil;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -27,6 +28,11 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.commons.validator.routines.DomainValidator;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.openide.util.Exceptions;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.Logger;
@@ -71,24 +77,24 @@ void searchChunk(Chunk chunk) throws TskCoreException {
                 List<KeywordHit> keywordHits = new ArrayList<>();
                 if (originalKeyword.searchTermIsLiteral()) {
-                    if (!originalKeyword.searchTermIsWholeWord()) {
+//                    if (!originalKeyword.searchTermIsWholeWord()) {
                     if (StringUtil.containsIgnoreCase(chunk.geLowerCasedChunk(), originalKeyword.getSearchTerm())) {
                         keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
                     }
-                    } else {
-                        String REGEX_FIND_WORD="\\b\\W*%s\\W*\\b"; //"[\\w[\\.']]*%s[\\w[\\.']]*"; //"(?i).*?\\b%s\\b.*?";
-                        String regex=String.format(REGEX_FIND_WORD, Pattern.quote(originalKeyword.getSearchTerm().toLowerCase()));
+//                    } else {
+//                        String REGEX_FIND_WORD="\\b\\W*%s\\W*\\b"; //"[\\w[\\.']]*%s[\\w[\\.']]*"; //"(?i).*?\\b%s\\b.*?";
+//                        String regex=String.format(REGEX_FIND_WORD, Pattern.quote(originalKeyword.getSearchTerm().toLowerCase()));
 //                        if(chunk.geLowerCasedChunk().matches(regex)) {
 //                            keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
 //                        }
-                        Pattern pattern = Pattern.compile(regex, java.util.regex.Pattern.CASE_INSENSITIVE);
-                        Matcher matcher = pattern.matcher(chunk.geLowerCasedChunk());
-                        if (matcher.find()) {
-                            keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
-                        }
-                    }
+//                        Pattern pattern = Pattern.compile(regex, java.util.regex.Pattern.CASE_INSENSITIVE);
+//                        Matcher matcher = pattern.matcher(chunk.geLowerCasedChunk());
+//                        if (matcher.find()) {
+//                            keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
+//                        }
+//                    }
 
                 } else {
                     String regex = originalKeyword.getSearchTerm();
@@ -163,6 +169,7 @@ private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword)
             } else {
                 String REGEX_FIND_WORD="\\b\\W*%s\\W*\\b";
                 searchPattern=String.format(REGEX_FIND_WORD, Pattern.quote(originalKeyword.getSearchTerm().toLowerCase()));
+                testingTokenizer(chunk, originalKeyword);
             }
         } else {
             searchPattern = keywordString;
@@ -353,4 +360,28 @@ void makeArtifacts(Content content, IngestJobContext context, long sourceID) {
             map.clear();
         }
     }
+
+    private void testingTokenizer(Chunk chunk, Keyword originalKeyword) {
+        try {
+            List<String> tokens = analyze(chunk.geLowerCasedChunk(), new StandardAnalyzer());
+            for(String token: tokens) {
+                if(token.equals(originalKeyword.getSearchTerm())) {
+
+                }
+            }
+        } catch (IOException ex) {
+            Exceptions.printStackTrace(ex);
+        }
+    }
+
+    public List<String> analyze(String text, Analyzer analyzer) throws IOException{
+        List<String> result = new ArrayList<>();
+        TokenStream tokenStream = analyzer.tokenStream("sampleName", text);
+        CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
+        tokenStream.reset();
+        while(tokenStream.incrementToken()) {
+            result.add(attr.toString());
+        }
+        return result;
+    }
 }
-- 
GitLab
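
Note on the Lucene helper this patch adds: analyze() never calls end() or
close() on the TokenStream, which leaks the stream and, if a single Analyzer
instance is ever reused, fails with a "TokenStream contract violation"
IllegalStateException on the next tokenStream() call. Below is a minimal,
self-contained sketch of a leak-free variant, assuming the same lucene-core
8.11.2 dependency the patch wires in; the TokenizeSketch class name and the
sample text are invented for illustration, and "sampleName" is the same
arbitrary field name the patch already uses.

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public final class TokenizeSketch {

        // Tokenize text with the given Analyzer, honoring the full TokenStream
        // lifecycle: reset() -> incrementToken()* -> end() -> close().
        static List<String> analyze(String text, Analyzer analyzer) throws IOException {
            List<String> result = new ArrayList<>();
            // try-with-resources closes the stream even if incrementToken() throws,
            // so a shared Analyzer instance stays reusable
            try (TokenStream tokenStream = analyzer.tokenStream("sampleName", text)) {
                CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    result.add(attr.toString());
                }
                tokenStream.end();
            }
            return result;
        }

        public static void main(String[] args) throws IOException {
            // StandardAnalyzer lower-cases and splits at word boundaries,
            // so a whole-word check reduces to an exact token comparison.
            try (Analyzer analyzer = new StandardAnalyzer()) {
                List<String> tokens = analyze("The quick-brown Fox.", analyzer);
                System.out.println(tokens);                  // [the, quick, brown, fox]
                System.out.println(tokens.contains("fox"));  // true: whole-word hit
            }
        }
    }

Comparing each token against the lower-cased search term, as the patch's
testingTokenizer() starts to do, would give the whole-word matching that the
commented-out \b regex in searchChunk() was approximating, since the analyzer
handles both the lower-casing and the word-boundary splitting.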