From b03f0ebe6bf2822bb003d941eda247010f1923fd Mon Sep 17 00:00:00 2001 From: M66B Date: Wed, 9 Aug 2023 18:54:37 +0200 Subject: [PATCH] Experiment: search attachment content --- ATTRIBUTION.md | 1 + app/src/main/assets/ATTRIBUTION.md | 1 + .../elephantbird/util/StreamSearcher.java | 87 +++++++++++++++++++ .../java/eu/faircode/email/ActivityView.java | 1 + .../java/eu/faircode/email/AdapterFolder.java | 2 + .../email/BoundaryCallbackMessages.java | 41 +++++++++ .../java/eu/faircode/email/CharsetHelper.java | 43 ++++++--- .../faircode/email/FragmentDialogSearch.java | 6 +- app/src/main/res/layout/dialog_search.xml | 36 ++++++-- app/src/main/res/values/strings.xml | 2 + 10 files changed, 199 insertions(+), 21 deletions(-) create mode 100644 app/src/main/java/com/twitter/elephantbird/util/StreamSearcher.java diff --git a/ATTRIBUTION.md b/ATTRIBUTION.md index 975f75893e..21f71df1d0 100644 --- a/ATTRIBUTION.md +++ b/ATTRIBUTION.md @@ -47,3 +47,4 @@ FairEmail uses: * [MaterialDings](https://github.com/Accusoft/MaterialDings). Copyright (c) 2018 Accusoft Corporation. [MIT License](https://github.com/Accusoft/MaterialDings/blob/master/LICENSE.md). * [Send](https://github.com/timvisee/send). [Mozilla Public License 2.0](https://github.com/timvisee/send/blob/master/LICENSE). * [DetectHtml](https://github.com/dbennett455/DetectHtml). [The MIT License](https://github.com/dbennett455/DetectHtml/blob/master/LICENSE). +* [Elephant Bird](https://github.com/twitter/elephant-bird). [Apache License Version 2.0](https://github.com/twitter/elephant-bird/blob/master/LICENSE). diff --git a/app/src/main/assets/ATTRIBUTION.md b/app/src/main/assets/ATTRIBUTION.md index 975f75893e..21f71df1d0 100644 --- a/app/src/main/assets/ATTRIBUTION.md +++ b/app/src/main/assets/ATTRIBUTION.md @@ -47,3 +47,4 @@ FairEmail uses: * [MaterialDings](https://github.com/Accusoft/MaterialDings). Copyright (c) 2018 Accusoft Corporation. 
[MIT License](https://github.com/Accusoft/MaterialDings/blob/master/LICENSE.md). * [Send](https://github.com/timvisee/send). [Mozilla Public License 2.0](https://github.com/timvisee/send/blob/master/LICENSE). * [DetectHtml](https://github.com/dbennett455/DetectHtml). [The MIT License](https://github.com/dbennett455/DetectHtml/blob/master/LICENSE). +* [Elephant Bird](https://github.com/twitter/elephant-bird). [Apache License Version 2.0](https://github.com/twitter/elephant-bird/blob/master/LICENSE). diff --git a/app/src/main/java/com/twitter/elephantbird/util/StreamSearcher.java b/app/src/main/java/com/twitter/elephantbird/util/StreamSearcher.java new file mode 100644 index 0000000000..a914aba425 --- /dev/null +++ b/app/src/main/java/com/twitter/elephantbird/util/StreamSearcher.java @@ -0,0 +1,87 @@ +package com.twitter.elephantbird.util; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +/** + * An efficient stream searching class based on the Knuth-Morris-Pratt algorithm. + * For more on how the algorithm works, see: https://www.inf.fh-flensburg.de/lang/algorithmen/pattern/kmpen.htm. + */ +public class StreamSearcher { + + protected byte[] pattern_; + protected int[] borders_; + + // An upper bound on pattern length for searching. Results are undefined for longer patterns. + public static final int MAX_PATTERN_LENGTH = 1024; + + public StreamSearcher(byte[] pattern) { + setPattern(pattern); + } + + /** + * Sets a new pattern for this StreamSearcher to use. + * @param pattern + * the pattern the StreamSearcher will look for in future calls to search(...) + */ + public void setPattern(byte[] pattern) { + pattern_ = Arrays.copyOf(pattern, pattern.length); + borders_ = new int[pattern_.length + 1]; + preProcess(); + } + + /** + * Searches for the next occurrence of the pattern in the stream, starting from the current stream position. Note + * that the position of the stream is changed. 
If a match is found, the stream points to the end of the match -- i.e. the + * byte AFTER the pattern. Else, the stream is entirely consumed. The latter is because InputStream semantics make it difficult to have + * another reasonable default, i.e. leave the stream unchanged. + * + * @return bytes consumed if found, -1 otherwise. + * @throws IOException + */ + public long search(InputStream stream) throws IOException { + long bytesRead = 0; + + int b; + int j = 0; + + while ((b = stream.read()) != -1) { + bytesRead++; + + while (j >= 0 && (byte)b != pattern_[j]) { + j = borders_[j]; + } + // Move to the next character in the pattern. + ++j; + + // If we've matched up to the full pattern length, we found it. Return, + // which will automatically save our position in the InputStream at the point immediately + // following the pattern match. + if (j == pattern_.length) { + return bytesRead; + } + } + + // No dice. Note that the stream is now completely consumed. + return -1; + } + + /** + * Builds up a table of longest "borders" for each prefix of the pattern to find. This table is stored internally + * and aids in implementation of the Knuth-Morris-Pratt string search. + *

+ * For more information, see: https://www.inf.fh-flensburg.de/lang/algorithmen/pattern/kmpen.htm. + */ + protected void preProcess() { + int i = 0; + int j = -1; + borders_[i] = j; + while (i < pattern_.length) { + while (j >= 0 && pattern_[i] != pattern_[j]) { + j = borders_[j]; + } + borders_[++i] = ++j; + } + } +} diff --git a/app/src/main/java/eu/faircode/email/ActivityView.java b/app/src/main/java/eu/faircode/email/ActivityView.java index fb4bf2fb61..fca4b71358 100644 --- a/app/src/main/java/eu/faircode/email/ActivityView.java +++ b/app/src/main/java/eu/faircode/email/ActivityView.java @@ -2427,6 +2427,7 @@ public class ActivityView extends ActivityBilling implements FragmentManager.OnB criteria.in_subject = false; criteria.in_keywords = false; criteria.in_message = false; + criteria.in_attachments = false; criteria.in_notes = false; } diff --git a/app/src/main/java/eu/faircode/email/AdapterFolder.java b/app/src/main/java/eu/faircode/email/AdapterFolder.java index 20fb340d84..d8ecff579c 100644 --- a/app/src/main/java/eu/faircode/email/AdapterFolder.java +++ b/app/src/main/java/eu/faircode/email/AdapterFolder.java @@ -525,6 +525,7 @@ public class AdapterFolder extends RecyclerView.Adapter attachments = db.attachment().getAttachments(message.id); + if (attachments != null) + for (EntityAttachment attachment : attachments) { + File file = attachment.getFile(context); + if (file.exists() && file.length() > 0) { + byte[] sample = new byte[(int) Math.min(4096, file.length())]; + try (InputStream is = new FileInputStream(file)) { + Helper.readBuffer(is, sample); + } + + Charset detected = CharsetHelper.detect(sample, null); + if (detected == null) + detected = StandardCharsets.ISO_8859_1; + + Log.i("Searching for " + criteria.query + + " as " + detected + + " in " + file.getName() + ":" + file.length()); + try (InputStream is = new FileInputStream(file)) { + StreamSearcher searcher = new StreamSearcher(criteria.query.getBytes(detected)); + if (searcher.search(is) > 
0) + return true; + } + } + } + } catch (Throwable ex) { + Log.e(ex); + } + return false; } @@ -987,6 +1023,7 @@ public class BoundaryCallbackMessages extends PagedList.BoundaryCallback + + + + + app:layout_constraintTop_toBottomOf="@id/tvSearchAttachmentsHint" /> + app:layout_constraintTop_toBottomOf="@id/cbWithAttachments" /> diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml index a2bdcf7e36..c40fc89220 100644 --- a/app/src/main/res/values/strings.xml +++ b/app/src/main/res/values/strings.xml @@ -1763,6 +1763,7 @@ Searching via the search index is fast, but only finds whole words. Searching for text in messages, when there are a large number of messages, might not work on some servers \'%s\' means that the mail server doesn\'t support searching in message texts + This will be slow when there are many or large attachments Searching for messages by size, when there are a large number of messages, might not work on some servers More options Use search index @@ -1771,6 +1772,7 @@ In subject In keywords (if supported) In message text + In attachments (on device only) In local notes In headers In HTML