diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED
index 44629aac36f7637717fd33a7ff4388ab2b60ddcf..7dd3ca7f7296d81a551d03874a85ceeb19836170 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED
@@ -15,16 +15,13 @@ ExtractAllTermsReport.error.noOpenCase=No currently open case.
 ExtractAllTermsReport.export.error=Error During Unique Word Extraction
 ExtractAllTermsReport.exportComplete=Unique Word Extraction Complete
 ExtractAllTermsReport.getName.text=Extract Unique Words
-# {0} - Number of extracted terms
 ExtractAllTermsReport.numberExtractedTerms=Extracted {0} terms...
 ExtractAllTermsReport.search.ingestInProgressBody=<html>Keyword Search Ingest is currently running.<br />Not all files have been indexed and unique word extraction might yield incomplete results.<br />Do you want to proceed with unique word extraction anyway?</html>
-# {0} - Keyword search commit frequency
 ExtractAllTermsReport.search.noFilesInIdxMsg=No files are in index yet. Try again later. Index is updated every {0} minutes.
 ExtractAllTermsReport.search.noFilesInIdxMsg2=No files are in index yet. Try again later
 ExtractAllTermsReport.search.searchIngestInProgressTitle=Keyword Search Ingest in Progress
 ExtractAllTermsReport.startExport=Starting Unique Word Extraction
 ExtractedContentPanel.setMarkup.panelTxt=<span style='font-style:italic'>Loading text... Please wait</span>
-# {0} - Content name
 ExtractedContentPanel.SetMarkup.progress.loading=Loading text for {0}
 GlobalEditListPanel.editKeyword.title=Edit Keyword
 GlobalEditListPanel.warning.text=Boundary characters ^ and $ do not match word boundaries. Consider\nreplacing with an explicit list of boundary characters, such as [ \\.,]
@@ -228,7 +225,6 @@ KeywordSearchSettings.propertiesNSRL.text={0}_NSRL
 KeywordSearchSettings.propertiesScripts.text={0}_Scripts
 NoOpenCoreException.err.noOpenSorlCore.msg=No currently open Solr core.
 SearchRunner.query.exception.msg=Error performing query:
-# {0} - colelction name
 Server.deleteCore.exception.msg=Failed to delete Solr colelction {0}
 Server.exceptionMessage.unableToBackupCollection=Unable to backup Solr collection
 Server.exceptionMessage.unableToCreateCollection=Unable to create Solr collection
@@ -371,7 +367,6 @@ SolrSearchService.exceptionMessage.noCurrentSolrCore=IndexMetadata did not conta
 SolrSearchService.exceptionMessage.noIndexMetadata=Unable to create IndexMetaData from case directory: {0}
 # {0} - collection name
 SolrSearchService.exceptionMessage.unableToDeleteCollection=Unable to delete collection {0}
-SolrSearchService.indexingError=Unable to index blackboard artifact.
 SolrSearchService.ServiceName=Solr Keyword Search Service
 SolrSearchService.DeleteDataSource.msg=Error Deleting Solr data for data source id {0}
 DropdownSingleTermSearchPanel.dataSourceCheckBox.text=Restrict search to the selected data sources:
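The deleted `# {0} - ...` lines above are translator comments documenting what each placeholder in the neighboring message stands for; the messages themselves are `java.text.MessageFormat` patterns whose arguments are supplied at runtime (in Autopsy, typically through `NbBundle.getMessage`). A minimal, self-contained sketch of how such a pattern is filled in; the class name and argument value are illustrative:

```java
import java.text.MessageFormat;

public class BundleMessageDemo {
    public static void main(String[] args) {
        // Same pattern as ExtractAllTermsReport.numberExtractedTerms above.
        String pattern = "Extracted {0} terms...";
        // MessageFormat applies locale-aware number formatting to {0},
        // so this prints "Extracted 1,250 terms..." in an English locale.
        System.out.println(MessageFormat.format(pattern, 1250));
    }
}
```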
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
index 2deb82d2f5bb7cbacf7a0e4db68c3cb9dcef9930..6489c90bf060e05e1e6744f5970f91505f11ab55 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@@ -420,7 +420,7 @@ public String toString() {
      *
      * @return The content of the chunk.
      */
-    public String geLowerCasedChunk() {
+    public String getLowerCasedChunk() {
         return lowerCasedChunk.toString();
     }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 994dfb9161e828cac501a40287b46144a08e0616..1f7efa122d3d18dcf8adf0ea513c88b28848e8fb 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -212,6 +212,7 @@ private < T extends SleuthkitVisitableItem> boolean indexTextAndSearch(Reader so
         //Get a reader for the content of the given source
         try (BufferedReader reader = new BufferedReader(sourceReader)) {
             Chunker chunker = new Chunker(reader);
+            searcher.searchString(sourceName, sourceID);
 
             while (chunker.hasNext()) {
                 if (context != null && context.fileIngestIsCancelled()) {
@@ -237,7 +238,7 @@ private < T extends SleuthkitVisitableItem> boolean indexTextAndSearch(Reader so
                 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                 try {
                     //add the chunk text to Solr index
-                    indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
+                    indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
                     // add mini chunk when there's a language specific field
                     if (chunker.hasNext() && language.isPresent()) {
                         languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
@@ -252,7 +253,7 @@ private < T extends SleuthkitVisitableItem> boolean indexTextAndSearch(Reader so
                 }
 
                 if(keywordListNames != null) {
-                    searcher.searchChunk(chunk);
+                    searcher.searchChunk(chunk, sourceID);
                 }
             }
             if (chunker.hasException()) {
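The `Ingester` change above wires the new inline searcher into the chunking loop: the source name is searched once up front (`searchString(sourceName, sourceID)`, since `searchChunk` takes a `Chunk`), then each chunk is both indexed into Solr and searched in memory as it is produced. A toy, self-contained sketch of that control flow; fixed-size chunks and a plain `contains` stand in for the real `Chunker` and `InlineSearcher`:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

public class ChunkedSearchSketch {
    public static void main(String[] args) throws IOException {
        String sourceName = "notes.txt";
        String keyword = "beta";

        // Search the file name first, mirroring searchString(sourceName, sourceID).
        if (sourceName.toLowerCase().contains(keyword)) {
            System.out.println("hit in source name");
        }

        // Then read the text in chunks, searching each chunk as it is indexed.
        char[] buf = new char[8]; // real chunks are far larger
        try (BufferedReader reader = new BufferedReader(new StringReader("alpha beta gamma beta"))) {
            int read;
            while ((read = reader.read(buf)) != -1) {
                String lowerCasedChunk = new String(buf, 0, read).toLowerCase();
                if (lowerCasedChunk.contains(keyword)) {
                    System.out.println("hit in chunk: \"" + lowerCasedChunk + "\"");
                }
            }
        }
    }
}
```

Note how the first `beta` straddles a chunk boundary and is missed; the real `Chunker` ends chunks at natural break points to reduce exactly this risk.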
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java
index 6dbd6c83fe3d139e20cea3d9a5340a66034dbbbf..6353dd6c39f9892c93fce686f93b95586cc67dd4 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/InlineSearcher.java
@@ -32,6 +32,11 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.Query;
 import org.openide.util.Exceptions;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
@@ -65,7 +70,25 @@ final class InlineSearcher {
         }
     }
 
-    void searchChunk(Chunk chunk) throws TskCoreException {
+    /**
+     * Search the chunk for the currently selected keywords.
+     *
+     * @param chunk
+     * @param sourceID
+     * @throws TskCoreException
+     */
+    void searchChunk(Chunk chunk, long sourceID) throws TskCoreException {
+        searchString(chunk.getLowerCasedChunk(), sourceID);
+    }
+
+    /**
+     * Search a string for the currently selected keywords.
+     *
+     * @param text
+     * @param sourceID
+     * @throws TskCoreException
+     */
+    void searchString(String text, long sourceID) throws TskCoreException {
         for (KeywordList list : keywordList) {
             List<Keyword> keywords = list.getKeywords();
             for (Keyword originalKeyword : keywords) {
@@ -77,34 +100,20 @@ void searchChunk(Chunk chunk) throws TskCoreException {
                 List<KeywordHit> keywordHits = new ArrayList<>();
                 if (originalKeyword.searchTermIsLiteral()) {
-//                    if (!originalKeyword.searchTermIsWholeWord()) {
-                    if (StringUtil.containsIgnoreCase(chunk.geLowerCasedChunk(), originalKeyword.getSearchTerm())) {
+                    if (StringUtil.containsIgnoreCase(text, originalKeyword.getSearchTerm())) {
-                        keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
-                    }
-//                    } else {
-//                        String REGEX_FIND_WORD="\\b\\W*%s\\W*\\b"; //"[\\w[\\.']]*%s[\\w[\\.']]*"; //"(?i).*?\\b%s\\b.*?";
-//                        String regex=String.format(REGEX_FIND_WORD, Pattern.quote(originalKeyword.getSearchTerm().toLowerCase()));
-//                        if(chunk.geLowerCasedChunk().matches(regex)) {
-//                            keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
-//                        }
-
-//                        Pattern pattern = Pattern.compile(regex, java.util.regex.Pattern.CASE_INSENSITIVE);
-//                        Matcher matcher = pattern.matcher(chunk.geLowerCasedChunk());
-//                        if (matcher.find()) {
-//                            keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
-//                        }
-//                    }
+                        keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID));
+                    }
                 } else {
                     String regex = originalKeyword.getSearchTerm();
                     try {
                         // validate the regex
                         Pattern pattern = Pattern.compile(regex);
-                        Matcher matcher = pattern.matcher(chunk.geLowerCasedChunk());
+                        Matcher matcher = pattern.matcher(text);
                         if (matcher.find()) {
-                            keywordHits.addAll(createKeywordHits(chunk, originalKeyword));
+                            keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID));
                         }
                     } catch (IllegalArgumentException ex) {
                         //TODO What should we do here? Log and continue?
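In the hunk above, the two match paths diverge: literal terms use a case-insensitive containment test against the already-lowercased text, while regex terms are compiled first (compilation doubles as validation) and then scanned. A self-contained sketch of both paths; the sample text and expression are illustrative, and `PatternSyntaxException` is the concrete `IllegalArgumentException` subclass the `TODO` is about:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public class LiteralVsRegexSketch {
    public static void main(String[] args) {
        String text = "contact: alice@example.com";

        // Literal branch: simple case-insensitive containment.
        boolean literalHit = text.toLowerCase().contains("ALICE".toLowerCase());

        // Regex branch: compiling validates the user-supplied expression.
        try {
            Matcher matcher = Pattern.compile("[a-z0-9._%+-]+@[a-z0-9.-]+").matcher(text);
            System.out.println("literal=" + literalHit + ", regex=" + matcher.find());
        } catch (PatternSyntaxException ex) {
            // A bad expression should be reported and skipped,
            // not allowed to abort the whole scan.
            System.err.println("invalid regex: " + ex.getMessage());
        }
    }
}
```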
@@ -132,15 +141,26 @@ void searchChunk(Chunk chunk) throws TskCoreException {
     }
 
     /**
-     * This method very similar to RegexQuery createKeywordHits, with the knowledge
-     * of solr removed.
-     *
-     * @param chunk
+     * This method is very similar to RegexQuery's createKeywordHits, with the
+     * knowledge of Solr removed.
+     *
+     * @param text
      * @param originalKeyword
-     * @return
-     * @throws TskCoreException
+     * @param sourceID
+     *
+     * @return A list of KeywordHit objects.
+     *
+     * @throws TskCoreException
      */
-    private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword) throws TskCoreException {
+    private List<KeywordHit> createKeywordHits(String text, Keyword originalKeyword, long sourceID) throws TskCoreException {
+
+        if (originalKeyword.searchTermIsLiteral() && originalKeyword.searchTermIsWholeWord()) {
+            try {
+                return getExactMatchHits(text, originalKeyword, sourceID);
+            } catch (IOException ex) {
+                throw new TskCoreException("Failed to create exactMatch hits", ex);
+            }
+        }
 
         final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();
@@ -164,13 +183,8 @@ private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword)
          * and possessives (e.g. hacker's). This obviously works for English
          * but is probably not sufficient for other languages.
          */
-            if(!originalKeyword.searchTermIsWholeWord()) {
-                searchPattern = "[\\w[\\.']]*" + java.util.regex.Pattern.quote(keywordString.toLowerCase()) + "[\\w[\\.']]*";
-            } else {
-                String REGEX_FIND_WORD="\\b\\W*%s\\W*\\b";
-                searchPattern=String.format(REGEX_FIND_WORD, Pattern.quote(originalKeyword.getSearchTerm().toLowerCase()));
-                testingTokenizer(chunk, originalKeyword);
-            }
+            searchPattern = "[\\w[\\.']]*" + java.util.regex.Pattern.quote(keywordString.toLowerCase()) + "[\\w[\\.']]*";
+
         } else {
             searchPattern = keywordString;
         }
@@ -178,7 +192,7 @@ private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword)
         final java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchPattern, java.util.regex.Pattern.CASE_INSENSITIVE);
 
         try {
-            String content = chunk.geLowerCasedChunk();
+            String content = text;
             Matcher hitMatcher = pattern.matcher(content);
             int offset = 0;
@@ -261,7 +275,7 @@ private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword)
                     keywordsFoundInThisDocument.put(hit, hit);
 
                     if (artifactAttributeType == null) {
-                        hits.add(new KeywordHit(0, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
+                        hits.add(new KeywordHit(0, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
                     } else {
                         switch (artifactAttributeType) {
                             case TSK_EMAIL:
@@ -272,7 +286,7 @@ private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword)
                                  */
                                 if (hit.length() >= MIN_EMAIL_ADDR_LENGTH
                                         && DomainValidator.getInstance(true).isValidTld(hit.substring(hit.lastIndexOf('.')))) {
-                                    hits.add(new KeywordHit(0, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
+                                    hits.add(new KeywordHit(0, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
                                 }
 
                                 break;
@@ -289,14 +303,14 @@ private List<KeywordHit> createKeywordHits(Chunk chunk, Keyword originalKeyword)
                                     if (ccnMatcher.find()) {
                                         final String group = ccnMatcher.group("ccn");
                                         if (CreditCardValidator.isValidCCN(group)) {
-                                            hits.add(new KeywordHit(0, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
+                                            hits.add(new KeywordHit(0, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
                                         }
                                     }
 
                                 break;
                             default:
-                                hits.add(new KeywordHit(0, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
+                                hits.add(new KeywordHit(0, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit));
                                 break;
                         }
                     }
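The switch in the hunks above exists because a broad regex over-matches: an email hit is only kept if its TLD validates, and a credit-card hit only if the candidate number passes `CreditCardValidator.isValidCCN()`. A minimal sketch of the Luhn checksum at the core of such card-number checks; the real validator performs additional structural checks (separators, lengths), so this is the idea, not a drop-in replacement:

```java
public class LuhnSketch {
    // Luhn checksum: double every second digit from the right, subtract 9
    // from any result over 9, and require the total to be divisible by 10.
    static boolean luhn(String digits) {
        int sum = 0;
        boolean doubleIt = false;
        for (int i = digits.length() - 1; i >= 0; i--) {
            int d = digits.charAt(i) - '0';
            if (doubleIt) {
                d *= 2;
                if (d > 9) {
                    d -= 9;
                }
            }
            sum += d;
            doubleIt = !doubleIt;
        }
        return sum % 10 == 0;
    }

    public static void main(String[] args) {
        System.out.println(luhn("4111111111111111")); // true  (well-known test number)
        System.out.println(luhn("4111111111111112")); // false (last digit corrupted)
    }
}
```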
@@ -360,28 +374,79 @@ void makeArtifacts(Content content, IngestJobContext context, long sourceID) {
             map.clear();
         }
     }
-
-    private void testingTokenizer(Chunk chunk, Keyword originalKeyword) {
-        try {
-            List<String> tokens = analyze(chunk.geLowerCasedChunk(), new StandardAnalyzer());
-            for(String token: tokens) {
-                if(token.equals(originalKeyword.getSearchTerm())) {
-
+
+    /**
+     * Searches the text for exact matches and creates the appropriate keyword
+     * hits.
+     *
+     * @param text
+     * @param originalKeyword
+     * @param sourceID
+     *
+     * @return A list of KeywordHit objects.
+     *
+     * @throws IOException
+     */
+    public List<KeywordHit> getExactMatchHits(String text, Keyword originalKeyword, long sourceID) throws IOException {
+        final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();
+
+        List<KeywordHit> hits = new ArrayList<>();
+        Analyzer analyzer = new StandardAnalyzer();
+
+        //Get the tokens of the keyword
+        List<String> keywordTokens = new ArrayList<>();
+        try (TokenStream keywordstream = analyzer.tokenStream("field", originalKeyword.getSearchTerm())) {
+            CharTermAttribute attr = keywordstream.addAttribute(CharTermAttribute.class);
+            keywordstream.reset();
+            while (keywordstream.incrementToken()) {
+                keywordTokens.add(attr.toString());
+            }
+        }
+
+        try (TokenStream stream = analyzer.tokenStream("field", text)) {
+            CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
+            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
+            stream.reset();
+            while (stream.incrementToken()) {
+                if (!attr.toString().equals(keywordTokens.get(0))) {
+                    continue;
+                }
+
+                int startOffset = offset.startOffset();
+                int endOffset = offset.endOffset();
+                boolean match = true;
+
+                for (int index = 1; index < keywordTokens.size(); index++) {
+                    if (stream.incrementToken()) {
+                        if (!attr.toString().equals(keywordTokens.get(index))) {
+                            match = false;
+                            break;
+                        } else {
+                            endOffset = offset.endOffset();
+                        }
+                    } else {
+                        // The text ran out of tokens before the whole keyword
+                        // phrase was matched.
+                        match = false;
+                        break;
+                    }
+                }
+
+                if (match) {
+                    String hit = text.subSequence(startOffset, endOffset).toString();
+
+                    // We will only create one KeywordHit instance per document for
+                    // a given hit.
+                    if (keywordsFoundInThisDocument.containsKey(hit)) {
+                        continue;
+                    }
+                    keywordsFoundInThisDocument.put(hit, hit);
+
+                    hits.add(new KeywordHit(0, sourceID, KeywordSearchUtil.makeSnippet(text, startOffset, endOffset, hit), hit));
                 }
             }
-        } catch (IOException ex) {
-            Exceptions.printStackTrace(ex);
         }
-    }
-
-    public List<String> analyze(String text, Analyzer analyzer) throws IOException{
-        List<String> result = new ArrayList<>();
-        TokenStream tokenStream = analyzer.tokenStream("sampleName", text);
-        CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
-        tokenStream.reset();
-        while(tokenStream.incrementToken()) {
-            result.add(attr.toString());
-        }
-        return result;
+
+        return hits;
     }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordHit.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordHit.java
index 88b2c988fd3bbdcb3c53ecb891ea4826f17d82fb..fab5caeaad1836dd8be442586303661bed97d167 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordHit.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordHit.java
@@ -86,11 +86,11 @@ class KeywordHit implements Comparable<KeywordHit> {
         }
     }
 
-    KeywordHit(int chunkId, String snippet, String hit) {
+    KeywordHit(int chunkId, long sourceID, String snippet, String hit) {
         this.snippet = StringUtils.stripToEmpty(snippet);
         this.hit = hit;
         this.chunkId = chunkId;
-        this.solrObjectId = 0;
+        this.solrObjectId = sourceID;
     }
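`getExactMatchHits()` above is the heart of the new whole-word matching: the keyword and the text are run through the same Lucene analyzer, and a hit is declared only when the keyword's full token sequence appears in the text's token stream, with `OffsetAttribute` supplying the offsets for snippet building. A standalone sketch of the same technique; the class name and sample strings are illustrative, and, like the method above, the lookahead consumes tokens, so overlapping candidate matches are skipped:

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenMatchSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        List<String> keyword = tokens(analyzer, "New York");
        String text = "Flights to NEW YORK and New Yorkshire.";

        try (TokenStream stream = analyzer.tokenStream("f", text)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Look for the first keyword token, then verify the rest follow.
                if (!term.toString().equals(keyword.get(0))) {
                    continue;
                }
                int start = offset.startOffset();
                int end = offset.endOffset();
                boolean match = true;
                for (int i = 1; i < keyword.size() && match; i++) {
                    if (stream.incrementToken() && term.toString().equals(keyword.get(i))) {
                        end = offset.endOffset();
                    } else {
                        match = false;
                    }
                }
                if (match) {
                    System.out.println("hit: " + text.substring(start, end)); // "NEW YORK"
                }
            }
            stream.end();
        }
    }

    private static List<String> tokens(Analyzer analyzer, String s) throws IOException {
        List<String> out = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("f", s)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                out.add(term.toString());
            }
            ts.end();
        }
        return out;
    }
}
```

Running both sides through one analyzer is what makes the match robust: comparison happens over normalized tokens (lowercased, punctuation stripped) rather than raw substrings, so `New York` matches `NEW YORK` but not `New Yorkshire`.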
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchUtil.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchUtil.java
index e5f39885a5416876735536e1c8d8e6481a169789..8cab5236ec7c33277d1e47398d95bc98d95189e4 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchUtil.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchUtil.java
@@ -169,13 +169,19 @@ static KeywordSearchQuery getQueryForKeyword(Keyword keyword, KeywordList keywor
      */
     static String makeSnippet(String content, Matcher hitMatcher, String hit) {
         // Get the snippet from the document.
-        int maxIndex = content.length() - 1;
         final int end = hitMatcher.end();
         final int start = hitMatcher.start();
 
-        return content.substring(Integer.max(0, start - 20), Integer.max(0, start))
+        return makeSnippet(content, start, end, hit);
+    }
+
+    static String makeSnippet(String content, int startOffset, int endOffset, String hit) {
+        // Get the snippet from the document.
+        int maxIndex = content.length() - 1;
+
+        return content.substring(Integer.max(0, startOffset - 20), Integer.max(0, startOffset))
                 + SNIPPET_DELIMITER + hit + SNIPPET_DELIMITER
-                + content.substring(Integer.min(maxIndex, end), Integer.min(maxIndex, end + 20));
+                + content.substring(Integer.min(maxIndex, endOffset), Integer.min(maxIndex, endOffset + 20));
     }
 
     /**
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
index 3d830489cca7384daa5063608a33f8e023e184fe..e5fd3abdc3bc2986cb5e35456d20f61781d23f1e 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
@@ -575,7 +575,9 @@ public static BlackboardArtifact createKeywordHitArtifact(Content content, Keyw
         Collection<BlackboardAttribute> attributes = new ArrayList<>();
 
         attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
-        attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, originalKW.getSearchTerm()));
+        if(!originalKW.searchTermIsWholeWord()) {
+            attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, originalKW.getSearchTerm()));
+        }
 
         if (StringUtils.isNotBlank(listName)) {
             attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/ExtractRegistry.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/ExtractRegistry.java
index 4dfcdd2193fc35b68785b9e634436c4a08081438..a88ad8cad85dd6feb52ccbdcd0cae974ef506a1e 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/ExtractRegistry.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/ExtractRegistry.java
@@ -420,15 +420,6 @@ private void analyzeRegistryFiles(long ingestJobId) {
                 Report report = currentCase.addReport(regOutputFiles.fullPlugins,
                         NbBundle.getMessage(this.getClass(), "ExtractRegistry.parentModuleName.noSpace"),
                         "RegRipper " + regFile.getUniquePath(), regFile); //NON-NLS
-
-                // Index the report content so that it will be available for keyword search.
-//                KeywordSearchService searchService = Lookup.getDefault().lookup(KeywordSearchService.class);
-//                if (null == searchService) {
-//                    logger.log(Level.WARNING, "Keyword search service not found. Report will not be indexed");
-//                } else {
-//                    searchService.index(report);
-//                    report.close();
-//                }
             } catch (TskCoreException e) {
                 this.addErrorMessage("Error adding regripper output as Autopsy report: " + e.getLocalizedMessage()); //NON-NLS
             }
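The `KeywordSearchUtil` refactor above splits `makeSnippet` so the token-stream matcher can build snippets from raw offsets, without a regex `Matcher`. The windowing arithmetic is unchanged: up to 20 characters of context on each side, clamped to the content bounds. A self-contained sketch; the delimiter is a stand-in for the real `SNIPPET_DELIMITER`:

```java
public class SnippetSketch {
    private static final String SNIPPET_DELIMITER = "\u00AB"; // illustrative delimiter

    // Same windowing arithmetic as the new makeSnippet(String, int, int, String).
    static String makeSnippet(String content, int startOffset, int endOffset, String hit) {
        int maxIndex = content.length() - 1;
        return content.substring(Integer.max(0, startOffset - 20), Integer.max(0, startOffset))
                + SNIPPET_DELIMITER + hit + SNIPPET_DELIMITER
                + content.substring(Integer.min(maxIndex, endOffset), Integer.min(maxIndex, endOffset + 20));
    }

    public static void main(String[] args) {
        String content = "The quick brown fox jumps over the lazy dog";
        int start = content.indexOf("fox");
        // Prints: The quick brown «fox« jumps over the laz...
        System.out.println(makeSnippet(content, start, start + 3, "fox"));
    }
}
```

One preexisting quirk carries over: because both bounds of the trailing window clamp to `maxIndex = content.length() - 1`, the final character of the content can never appear in the trailing context.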