Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
signalapp
GitHub Repository: signalapp/Signal-iOS
Path: blob/main/SignalServiceKit/Search/FullTextSearchIndexer.swift
1 views
//
// Copyright 2018 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
//

import GRDB

public enum FullTextSearchIndexer {
    public static let matchTag = "match"

    // MARK: - Normalization

    private static var charactersToRemove: CharacterSet = {
        // * We want to strip punctuation - and our definition of "punctuation"
        //   is broader than `CharacterSet.punctuationCharacters`.
        // * FTS should be robust to (i.e. ignore) illegal and control characters,
        //   but it's safer if we filter them ourselves as well.
        var charactersToFilter = CharacterSet.punctuationCharacters
        charactersToFilter.formUnion(CharacterSet.illegalCharacters)
        charactersToFilter.formUnion(CharacterSet.controlCharacters)

        // We want to strip all ASCII characters except:
        // * Letters a-z, A-Z
        // * Numerals 0-9
        // * Whitespace
        var asciiToFilter = CharacterSet(charactersIn: UnicodeScalar(0x0)!..<UnicodeScalar(0x80)!)
        assert(!asciiToFilter.contains(UnicodeScalar(0x80)!))
        asciiToFilter.subtract(CharacterSet.alphanumerics)
        asciiToFilter.subtract(CharacterSet.whitespacesAndNewlines)
        charactersToFilter.formUnion(asciiToFilter)

        return charactersToFilter
    }()

    // This is a hot method, especially while running large migrations.
    // Changes to it should go through a profiler to make sure large migrations
    // aren't adversely affected.
    public static func normalizeText(_ text: String) -> String {
        // 1. Filter out invalid characters.
        let filtered = text.removeCharacters(characterSet: charactersToRemove)

        // 2. Simplify whitespace.
        let simplified = filtered.replaceCharacters(
            characterSet: .whitespacesAndNewlines,
            replacement: " ",
        )

        // 3. Strip leading & trailing whitespace last, since we may replace
        // filtered characters with whitespace.
        let trimmed = simplified.trimmingCharacters(in: .whitespacesAndNewlines)

        // 4. Use canonical mapping.
        //
        // From the GRDB docs:
        //
        // Generally speaking, matches may fail when content and query don’t use
        // the same unicode normalization. SQLite actually exhibits inconsistent
        // behavior in this regard.
        //
        // For example, for aimé to match aimé, they better have the same
        // normalization: the NFC aim\u{00E9} form may not match its NFD aime\u{0301}
        // equivalent. Most strings that you get from Swift, UIKit and Cocoa use NFC,
        // so be careful with NFD inputs (such as strings from the HFS+ file system,
        // or strings that you can’t trust like network inputs). Use
        // String.precomposedStringWithCanonicalMapping to turn a string into NFC.
        //
        // Besides, if you want fi to match the ligature fi (U+FB01), then you need
        // to normalize your indexed contents and inputs to NFKC or NFKD. Use
        // String.precomposedStringWithCompatibilityMapping to turn a string into NFKC.
        let canonical = trimmed.precomposedStringWithCanonicalMapping

        return canonical
    }

    // MARK: - Querying

    // We want to match by prefix for "search as you type" functionality.
    // SQLite does not support suffix or contains matches.
    public static func buildQuery(for searchText: String) -> String {
        // 1. Normalize the search text.
        //
        // TODO: We could arguably convert to lowercase since the search
        // is case-insensitive.
        let normalizedSearchText = normalizeText(searchText)

        // 2. Split the non-numeric text into query terms (or tokens).
        let nonNumericText = String(String.UnicodeScalarView(normalizedSearchText.unicodeScalars.lazy.map {
            if CharacterSet.decimalDigits.contains($0) {
                return " "
            } else {
                return $0
            }
        }))
        var queryTerms = nonNumericText.split(separator: " ")

        // 3. Add an additional numeric-only query term.
        let digitsOnlyScalars = normalizedSearchText.unicodeScalars.lazy.filter {
            CharacterSet.decimalDigits.contains($0)
        }
        let digitsOnly: Substring = Substring(String(String.UnicodeScalarView(digitsOnlyScalars)))
        queryTerms.append(digitsOnly)

        // 4. De-duplicate and sort query terms.
        //    Duplicate terms are redundant.
        //    Sorting terms makes the output of this method deterministic and easier to test,
        //        and the order won't affect the search results.
        queryTerms = Array(Set(queryTerms)).sorted()

        // 5. Filter the query terms.
        let filteredQueryTerms = queryTerms.filter {
            // Ignore empty terms.
            $0.count > 0
        }.map {
            // Allow partial match of each term.
            //
            // Note that we use double-quotes to enclose each search term.
            // Quoted search terms can include a few more characters than
            // "bareword" (non-quoted) search terms.  This shouldn't matter,
            // since we're filtering all of the affected characters, but
            // quoting protects us from any bugs in that logic.
            "\"\($0)\"*"
        }

        // 6. Join terms into query string.
        let query = filteredQueryTerms.joined(separator: " ")
        return query
    }
}

// MARK: - Message Index

// See: http://groue.github.io/GRDB.swift/docs/4.1/index.html#full-text-search
// See: https://www.sqlite.org/fts5.html

extension FullTextSearchIndexer {
    public static let ftsTableName = "indexable_text_fts"

    static let contentTableName = "indexable_text"
    static let uniqueIdColumn = "uniqueId"
    static let collectionColumn = "collection"
    static let ftsContentColumn = "ftsIndexableContent"

    private static let legacyCollectionName = "TSInteraction"

    private static func indexableContent(for message: TSMessage, tx: DBReadTransaction) -> String? {
        guard !message.isViewOnceMessage else {
            // Don't index "view-once messages".
            return nil
        }
        guard !message.isGroupStoryReply else {
            return nil
        }
        guard message.editState != .pastRevision else {
            return nil
        }
        guard let bodyText = message.rawBody(transaction: tx) else {
            return nil
        }
        return normalizeText(bodyText)
    }

    public static func insert(
        _ message: TSMessage,
        tx: DBWriteTransaction,
    ) {
        guard let ftsContent = indexableContent(for: message, tx: tx) else {
            return
        }
        executeUpdate(
            sql: """
            INSERT INTO \(contentTableName)
            (\(collectionColumn), \(uniqueIdColumn), \(ftsContentColumn))
            VALUES
            (?, ?, ?)
            """,
            arguments: [legacyCollectionName, message.uniqueId, ftsContent],
            tx: tx,
        )
    }

    public static func update(
        _ message: TSMessage,
        tx: DBWriteTransaction,
    ) {
        delete(message, tx: tx)
        insert(message, tx: tx)
    }

    public static func delete(
        _ message: TSMessage,
        tx: DBWriteTransaction,
    ) {
        executeUpdate(
            sql: """
            DELETE FROM \(contentTableName)
            WHERE \(uniqueIdColumn) == ?
            AND \(collectionColumn) == ?
            """,
            arguments: [message.uniqueId, legacyCollectionName],
            tx: tx,
        )
    }

    private static func executeUpdate(
        sql: String,
        arguments: StatementArguments,
        tx: DBWriteTransaction,
    ) {
        do {
            // Worth using a cached statement here, as we may be indexing many
            // things at once.
            try tx.database.executeWithCachedStatementThrows(
                sql: sql,
                arguments: arguments,
            )
        } catch {
            // We intentionally don't use failIfThrows here because we know the
            // FTS index relatively frequently reports corruption errors; for
            // these specifically swallow them rather than flagging the entire
            // database as corrupted.
            //
            // TODO: Implement "rebuild the FTS index" in response to it becoming corrupted.
            owsFailDebug("Failed to perform FTS index operation! \(error.grdbErrorForLogging)")
        }
    }

    // MARK: - Querying

    public static func search(
        for searchText: String,
        maxResults: Int,
        tx: DBReadTransaction,
        block: (_ message: TSMessage, _ snippet: String, _ stop: inout Bool) -> Void,
    ) {
        let query = buildQuery(for: searchText)

        if query.isEmpty {
            // FullTextSearchFinder.query filters some characters, so query
            // may now be empty.
            Logger.warn("Empty query.")
            return
        }

        // Search with the query interface or SQL
        do {
            // GRDB TODO: We could use bm25() instead of rank to order results.
            let indexOfContentColumnInFTSTable = 0
            // Determines the length of the snippet.
            let numTokens: UInt = 15
            let matchSnippet = "match_snippet"
            let sql: String = """
            SELECT
                \(contentTableName).\(collectionColumn),
                \(contentTableName).\(uniqueIdColumn),
                SNIPPET(\(ftsTableName), \(indexOfContentColumnInFTSTable), '<\(matchTag)>', '</\(matchTag)>', '…', \(numTokens)) AS \(matchSnippet)
            FROM \(ftsTableName)
            LEFT JOIN \(contentTableName) ON \(contentTableName).rowId = \(ftsTableName).rowId
            WHERE \(ftsTableName).\(ftsContentColumn) MATCH ?
            ORDER BY rank
            LIMIT \(maxResults)
            """

            let cursor = try Row.fetchCursor(tx.database, sql: sql, arguments: [query])
            while let row = try cursor.next() {
                let collection: String = row[collectionColumn]
                guard collection == legacyCollectionName else {
                    owsFailDebug("Found something other than a message in the FTS table")
                    continue
                }
                guard let uniqueId = (row[uniqueIdColumn] as String).nilIfEmpty else {
                    owsFailDebug("Found a message with a uniqueId in the FTS table")
                    continue
                }
                let snippet: String = row[matchSnippet]
                guard let message = TSMessage.fetchMessageViaCache(uniqueId: uniqueId, transaction: tx) else {
                    owsFailDebug("Couldn't find message that exists in the FTS table")
                    continue
                }
                var stop = false
                block(message, snippet, &stop)
                if stop {
                    break
                }
            }
        } catch {
            owsFailDebug("Couldn't fetch results: \(error.grdbErrorForLogging)")
        }
    }
}