from pickle import TRUE
import streamlit as st
import spacy
from spacy import displacy, load
import re
import csv
# Streamlit page setup -- must be the first st.* call executed in the script.
st.set_page_config(
    page_title="NER for SG Locations",
    # NOTE(review): icon string looks mojibake-garbled (presumably an emoji,
    # likely the SG flag) -- confirm the intended glyph against the repo.
    page_icon="πΈπ¬",
)
def _max_width_(max_width_px: int = 1400) -> None:
    """Widen Streamlit's main content container via a CSS override.

    Streamlit's default block container is fairly narrow; this injects a
    ``max-width`` rule so the side-by-side entity renderings have room.

    Args:
        max_width_px: Maximum width of the content area in pixels.
            Defaults to 1400, the value originally hard-coded here.
    """
    # The original used an f-string with no placeholder for the width; the
    # width is now interpolated directly into the style block.
    st.markdown(
        f"""
        <style>
        .main .block-container{{
            max-width: {max_width_px}px;
        }}
        </style>
        """,
        unsafe_allow_html=True,
    )


_max_width_()
@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
def load_models():
    """Load and cache the four spaCy pipelines the app compares.

    Returns:
        dict: pipelines keyed by short code --
            ``"std"``: stock ``en_core_web_md`` English model,
            ``"erl"``: dictionary-centric model v2.1,
            ``"dcn"``: NER model v3.0 (best checkpoint),
            ``"v31"``: NER model v3.1 (best checkpoint).

    NOTE(review): ``st.cache`` with ``allow_output_mutation`` is deprecated in
    recent Streamlit releases in favour of ``st.cache_resource`` -- consider
    migrating when upgrading Streamlit.
    """
    standard_model = spacy.load("en_core_web_md")
    er_model = spacy.load("./models/model_v2.1")
    doccano_model = spacy.load("./models/model_v3.0/model-best")
    v31_model = spacy.load("./models/model_v3.1/model-best")
    models = {"std": standard_model, "erl": er_model,
              "dcn": doccano_model, "v31": v31_model}
    return models
# Instantiate the (cached) spaCy pipelines once at startup.
models = load_models()

# Abbreviation lookup table: each CSV row holds abbreviated spellings in
# columns 0-2 and the full form in column 3 (see lengthen_abbreviations).
# encoding is pinned to UTF-8 so behaviour does not depend on the host
# locale, and newline="" is the csv-module-recommended open mode.
with open("./data/extracted_locations/sg_abbreviations.csv", "r",
          encoding="utf-8", newline="") as csv_file:
    abbreviation_dictionary = list(csv.reader(csv_file, delimiter=","))
def lengthen_abbreviations(text, abbreviations=None):
    """Expand known abbreviations in *text* to their full written form.

    The text is split into word / punctuation / whitespace tokens; any token
    equal to one of a row's abbreviated spellings (columns 0-2) -- or to the
    full form itself in column 3, a harmless self-replacement -- is replaced
    by that row's full form (column 3).  When a token matches several rows,
    the last matching row wins, preserving the original scan order.

    Args:
        text: Raw input text.
        abbreviations: Optional list of 4-column rows
            ``[abbr1, abbr2, abbr3, full_form]``.  Defaults to the
            module-level ``abbreviation_dictionary`` loaded from CSV.

    Returns:
        str: The text with abbreviations expanded.
    """
    if abbreviations is None:
        abbreviations = abbreviation_dictionary
    # Tokenise into words, punctuation(+trailing space), single spaces and
    # hyphens so the pieces re-join with no separator.
    tokens = re.findall(r"[\w']+|[.,!?;&] | |-", text)
    for i, token in enumerate(tokens):
        for row in abbreviations:
            if token in row[:4]:
                tokens[i] = row[3]
    return ''.join(tokens)
# Page header and tagline shown above the input form.
st.title("Named Entity Recogniser for Singapore Locations π")
st.write("Compare the Standard English NLP Model with the Trained SG Location Names Model.")
def clear_form():
    """Blank the text area (session-state key ``1``, matching the widget's
    ``key=1``) when the Clear button is pressed."""
    st.session_state[1] = ""
# Display names of the four models offered in the multiselect widget.
_ALL_MODEL_NAMES = ['Standard Model', 'Dictionary Model',
                    'NER Model 3.0', 'NER Model 3.1']


def select_models(all_models_selected):
    """Render the model multiselect inside the form's top container.

    Args:
        all_models_selected: When truthy ("Select all models" checkbox),
            every model is pre-selected.

    Returns:
        list[str]: The model display names chosen by the user.
    """
    # Single option list replaces the previously duplicated literals; the
    # default selection is everything or nothing.
    default = _ALL_MODEL_NAMES if all_models_selected else []
    return container.multiselect(
        "Choose one or more models to analyse text with:",
        _ALL_MODEL_NAMES,
        default,
    )
def find_ents(model, input, abr_lengthen):
    """Run *model* over the text and render its entities inline.

    Args:
        model: A loaded spaCy pipeline.
        input: The text to analyse.  (Name kept for caller compatibility
            even though it shadows the ``input`` builtin.)
        abr_lengthen: When truthy, expand known abbreviations first via
            ``lengthen_abbreviations``.
    """
    # Truthiness test replaces the previous `== True` comparison.
    text = lengthen_abbreviations(input) if abr_lengthen else input
    doc = model(text)
    # displacy emits standalone HTML; render it through markdown with HTML on.
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    st.markdown(ent_html, unsafe_allow_html=True)
    st.write("")
# Maps each UI display name to a (section header, spaCy pipeline) pair
# consumed by display_models().  NOTE(review): the header strings contain
# mojibake-garbled characters (presumably emoji) -- confirm against the repo.
model_choice = {"Standard Model": ("Pre-trained Standard English Model π", models["std"]),
                "Dictionary Model": ("Dictionary-centric Model for SG Locations π", models["erl"]),
                "NER Model 3.0": ("Enhanced NER-centric Model 3.0 for SG Locations π¦", models["dcn"]),
                "NER Model 3.1": ("Enhanced NER-centric Model 3.1 for SG Locations π", models["v31"])
                }
def display_models(selected_models, text_input, abbreviation_status):
    """Render a section header plus entity visualisation for each chosen model.

    Args:
        selected_models: Display names chosen in the multiselect; each must
            be a key of ``model_choice``.
        text_input: The raw text to analyse.
        abbreviation_status: Forwarded to ``find_ents`` to control
            abbreviation expansion.
    """
    for name in selected_models:
        header, pipeline = model_choice[name]
        st.header(header)
        find_ents(pipeline, text_input, abbreviation_status)
# --- Input form: model choice, options, text area and action buttons. ---
with st.form("NER_form"):
    # Container created first so the multiselect rendered later by
    # select_models() appears above the checkboxes on the page.
    container = st.container()
    c_all_model_selection, c_abbreviate_selection, c_selection_last = st.columns([
        1, 1, 3])
    with c_all_model_selection:
        all_models_selected = st.checkbox("Select all models")
    with c_abbreviate_selection:
        abbreviation_status = st.checkbox("Lengthen abbreviations")
    selected_models = select_models(all_models_selected)
    text_input = st.empty()
    # key=1 ties this widget to clear_form(), which blanks st.session_state[1].
    input = text_input.text_area('Text to analyze:', key=1)
    c_submit, c_clear, c_last = st.columns([1, 1, 5])
    with c_submit:
        submitted = st.form_submit_button("Find Locations π")
    with c_clear:
        click_clear = st.form_submit_button(
            'Clear text input β«', on_click=clear_form)
# Run the analysis only after an explicit submit.
if submitted:
    display_models(selected_models, input, abbreviation_status)
# --- Collapsible "about" section and trailing spacer. ---
with st.expander("βΉοΈ - About this app", expanded=False):
    st.write(
        """
- This *Named Entity Recognition model for Singapore Location Names* detects Singaporean addresses, place names and building names from text.
- It was made with [spaCy v3](https://spacy.io/), an open-source library for Natural Language Processing.
- Check out the source code here on my [Github repo](https://github.com/drgnfrts/Singapore-Locations-NER)! :)
"""
    )
st.markdown("")