Path: blob/main/SignalServiceKit/Search/FullTextSearchIndexer.swift
1 views
//
// Copyright 2018 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
//
import GRDB
public enum FullTextSearchIndexer {
public static let matchTag = "match"
// MARK: - Normalization
private static var charactersToRemove: CharacterSet = {
// * We want to strip punctuation - and our definition of "punctuation"
// is broader than `CharacterSet.punctuationCharacters`.
// * FTS should be robust to (i.e. ignore) illegal and control characters,
// but it's safer if we filter them ourselves as well.
var charactersToFilter = CharacterSet.punctuationCharacters
charactersToFilter.formUnion(CharacterSet.illegalCharacters)
charactersToFilter.formUnion(CharacterSet.controlCharacters)
// We want to strip all ASCII characters except:
// * Letters a-z, A-Z
// * Numerals 0-9
// * Whitespace
var asciiToFilter = CharacterSet(charactersIn: UnicodeScalar(0x0)!..<UnicodeScalar(0x80)!)
assert(!asciiToFilter.contains(UnicodeScalar(0x80)!))
asciiToFilter.subtract(CharacterSet.alphanumerics)
asciiToFilter.subtract(CharacterSet.whitespacesAndNewlines)
charactersToFilter.formUnion(asciiToFilter)
return charactersToFilter
}()
// This is a hot method, especially while running large migrations.
// Changes to it should go through a profiler to make sure large migrations
// aren't adversely affected.
public static func normalizeText(_ text: String) -> String {
// 1. Filter out invalid characters.
let filtered = text.removeCharacters(characterSet: charactersToRemove)
// 2. Simplify whitespace.
let simplified = filtered.replaceCharacters(
characterSet: .whitespacesAndNewlines,
replacement: " ",
)
// 3. Strip leading & trailing whitespace last, since we may replace
// filtered characters with whitespace.
let trimmed = simplified.trimmingCharacters(in: .whitespacesAndNewlines)
// 4. Use canonical mapping.
//
// From the GRDB docs:
//
// Generally speaking, matches may fail when content and query don’t use
// the same unicode normalization. SQLite actually exhibits inconsistent
// behavior in this regard.
//
// For example, for aimé to match aimé, they better have the same
// normalization: the NFC aim\u{00E9} form may not match its NFD aime\u{0301}
// equivalent. Most strings that you get from Swift, UIKit and Cocoa use NFC,
// so be careful with NFD inputs (such as strings from the HFS+ file system,
// or strings that you can’t trust like network inputs). Use
// String.precomposedStringWithCanonicalMapping to turn a string into NFC.
//
// Besides, if you want fi to match the ligature fi (U+FB01), then you need
// to normalize your indexed contents and inputs to NFKC or NFKD. Use
// String.precomposedStringWithCompatibilityMapping to turn a string into NFKC.
let canonical = trimmed.precomposedStringWithCanonicalMapping
return canonical
}
// MARK: - Querying
// We want to match by prefix for "search as you type" functionality.
// SQLite does not support suffix or contains matches.
public static func buildQuery(for searchText: String) -> String {
// 1. Normalize the search text.
//
// TODO: We could arguably convert to lowercase since the search
// is case-insensitive.
let normalizedSearchText = normalizeText(searchText)
// 2. Split the non-numeric text into query terms (or tokens).
let nonNumericText = String(String.UnicodeScalarView(normalizedSearchText.unicodeScalars.lazy.map {
if CharacterSet.decimalDigits.contains($0) {
return " "
} else {
return $0
}
}))
var queryTerms = nonNumericText.split(separator: " ")
// 3. Add an additional numeric-only query term.
let digitsOnlyScalars = normalizedSearchText.unicodeScalars.lazy.filter {
CharacterSet.decimalDigits.contains($0)
}
let digitsOnly: Substring = Substring(String(String.UnicodeScalarView(digitsOnlyScalars)))
queryTerms.append(digitsOnly)
// 4. De-duplicate and sort query terms.
// Duplicate terms are redundant.
// Sorting terms makes the output of this method deterministic and easier to test,
// and the order won't affect the search results.
queryTerms = Array(Set(queryTerms)).sorted()
// 5. Filter the query terms.
let filteredQueryTerms = queryTerms.filter {
// Ignore empty terms.
$0.count > 0
}.map {
// Allow partial match of each term.
//
// Note that we use double-quotes to enclose each search term.
// Quoted search terms can include a few more characters than
// "bareword" (non-quoted) search terms. This shouldn't matter,
// since we're filtering all of the affected characters, but
// quoting protects us from any bugs in that logic.
"\"\($0)\"*"
}
// 6. Join terms into query string.
let query = filteredQueryTerms.joined(separator: " ")
return query
}
}
// MARK: - Message Index
// See: http://groue.github.io/GRDB.swift/docs/4.1/index.html#full-text-search
// See: https://www.sqlite.org/fts5.html
extension FullTextSearchIndexer {
public static let ftsTableName = "indexable_text_fts"
static let contentTableName = "indexable_text"
static let uniqueIdColumn = "uniqueId"
static let collectionColumn = "collection"
static let ftsContentColumn = "ftsIndexableContent"
private static let legacyCollectionName = "TSInteraction"
private static func indexableContent(for message: TSMessage, tx: DBReadTransaction) -> String? {
guard !message.isViewOnceMessage else {
// Don't index "view-once messages".
return nil
}
guard !message.isGroupStoryReply else {
return nil
}
guard message.editState != .pastRevision else {
return nil
}
guard let bodyText = message.rawBody(transaction: tx) else {
return nil
}
return normalizeText(bodyText)
}
public static func insert(
_ message: TSMessage,
tx: DBWriteTransaction,
) {
guard let ftsContent = indexableContent(for: message, tx: tx) else {
return
}
executeUpdate(
sql: """
INSERT INTO \(contentTableName)
(\(collectionColumn), \(uniqueIdColumn), \(ftsContentColumn))
VALUES
(?, ?, ?)
""",
arguments: [legacyCollectionName, message.uniqueId, ftsContent],
tx: tx,
)
}
public static func update(
_ message: TSMessage,
tx: DBWriteTransaction,
) {
delete(message, tx: tx)
insert(message, tx: tx)
}
public static func delete(
_ message: TSMessage,
tx: DBWriteTransaction,
) {
executeUpdate(
sql: """
DELETE FROM \(contentTableName)
WHERE \(uniqueIdColumn) == ?
AND \(collectionColumn) == ?
""",
arguments: [message.uniqueId, legacyCollectionName],
tx: tx,
)
}
private static func executeUpdate(
sql: String,
arguments: StatementArguments,
tx: DBWriteTransaction,
) {
do {
// Worth using a cached statement here, as we may be indexing many
// things at once.
try tx.database.executeWithCachedStatementThrows(
sql: sql,
arguments: arguments,
)
} catch {
// We intentionally don't use failIfThrows here because we know the
// FTS index relatively frequently reports corruption errors; for
// these specifically swallow them rather than flagging the entire
// database as corrupted.
//
// TODO: Implement "rebuild the FTS index" in response to it becoming corrupted.
owsFailDebug("Failed to perform FTS index operation! \(error.grdbErrorForLogging)")
}
}
// MARK: - Querying
public static func search(
for searchText: String,
maxResults: Int,
tx: DBReadTransaction,
block: (_ message: TSMessage, _ snippet: String, _ stop: inout Bool) -> Void,
) {
let query = buildQuery(for: searchText)
if query.isEmpty {
// FullTextSearchFinder.query filters some characters, so query
// may now be empty.
Logger.warn("Empty query.")
return
}
// Search with the query interface or SQL
do {
// GRDB TODO: We could use bm25() instead of rank to order results.
let indexOfContentColumnInFTSTable = 0
// Determines the length of the snippet.
let numTokens: UInt = 15
let matchSnippet = "match_snippet"
let sql: String = """
SELECT
\(contentTableName).\(collectionColumn),
\(contentTableName).\(uniqueIdColumn),
SNIPPET(\(ftsTableName), \(indexOfContentColumnInFTSTable), '<\(matchTag)>', '</\(matchTag)>', '…', \(numTokens)) AS \(matchSnippet)
FROM \(ftsTableName)
LEFT JOIN \(contentTableName) ON \(contentTableName).rowId = \(ftsTableName).rowId
WHERE \(ftsTableName).\(ftsContentColumn) MATCH ?
ORDER BY rank
LIMIT \(maxResults)
"""
let cursor = try Row.fetchCursor(tx.database, sql: sql, arguments: [query])
while let row = try cursor.next() {
let collection: String = row[collectionColumn]
guard collection == legacyCollectionName else {
owsFailDebug("Found something other than a message in the FTS table")
continue
}
guard let uniqueId = (row[uniqueIdColumn] as String).nilIfEmpty else {
owsFailDebug("Found a message with a uniqueId in the FTS table")
continue
}
let snippet: String = row[matchSnippet]
guard let message = TSMessage.fetchMessageViaCache(uniqueId: uniqueId, transaction: tx) else {
owsFailDebug("Couldn't find message that exists in the FTS table")
continue
}
var stop = false
block(message, snippet, &stop)
if stop {
break
}
}
} catch {
owsFailDebug("Couldn't fetch results: \(error.grdbErrorForLogging)")
}
}
}