first commit

This view is limited to 50 files because it contains too many changes.
- .gitignore +332 -0
- MANIFEST.in +1 -0
- README.md +1 -13
- SETUP.cfg +8 -0
- app.py +245 -0
- dockerfiles/Dockerfile.cpu +17 -0
- dockerfiles/Dockerfile.cuda +38 -0
- examples/train_retriever.py +45 -0
- pyproject.toml +15 -0
- relik/__init__.py +1 -0
- relik/common/__init__.py +0 -0
- relik/common/log.py +97 -0
- relik/common/upload.py +128 -0
- relik/common/utils.py +609 -0
- relik/inference/__init__.py +0 -0
- relik/inference/annotator.py +422 -0
- relik/inference/data/__init__.py +0 -0
- relik/inference/data/objects.py +64 -0
- relik/inference/data/tokenizers/__init__.py +89 -0
- relik/inference/data/tokenizers/base_tokenizer.py +84 -0
- relik/inference/data/tokenizers/regex_tokenizer.py +73 -0
- relik/inference/data/tokenizers/spacy_tokenizer.py +228 -0
- relik/inference/data/tokenizers/whitespace_tokenizer.py +70 -0
- relik/inference/data/window/__init__.py +0 -0
- relik/inference/data/window/manager.py +262 -0
- relik/inference/gerbil.py +254 -0
- relik/inference/preprocessing.py +4 -0
- relik/inference/serve/__init__.py +0 -0
- relik/inference/serve/backend/__init__.py +0 -0
- relik/inference/serve/backend/relik.py +210 -0
- relik/inference/serve/backend/retriever.py +206 -0
- relik/inference/serve/backend/utils.py +29 -0
- relik/inference/serve/frontend/__init__.py +0 -0
- relik/inference/serve/frontend/relik.py +231 -0
- relik/inference/serve/frontend/style.css +33 -0
- relik/reader/__init__.py +0 -0
- relik/reader/conf/config.yaml +14 -0
- relik/reader/conf/data/base.yaml +21 -0
- relik/reader/conf/data/re.yaml +54 -0
- relik/reader/conf/training/base.yaml +12 -0
- relik/reader/conf/training/re.yaml +12 -0
- relik/reader/data/__init__.py +0 -0
- relik/reader/data/patches.py +51 -0
- relik/reader/data/relik_reader_data.py +965 -0
- relik/reader/data/relik_reader_data_utils.py +51 -0
- relik/reader/data/relik_reader_sample.py +49 -0
- relik/reader/lightning_modules/__init__.py +0 -0
- relik/reader/lightning_modules/relik_reader_pl_module.py +50 -0
- relik/reader/lightning_modules/relik_reader_re_pl_module.py +54 -0
- relik/reader/pytorch_modules/__init__.py +0 -0
.gitignore
ADDED
@@ -0,0 +1,332 @@
# custom

data/*
experiments/*
retrievers
outputs
model
wandb

# Created by https://www.toptal.com/developers/gitignore/api/jetbrains+all,vscode,python,jupyternotebooks,linux,windows,macos
# Edit at https://www.toptal.com/developers/gitignore?templates=jetbrains+all,vscode,python,jupyternotebooks,linux,windows,macos

### JetBrains+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### JetBrains+all Patch ###
# Ignores the whole .idea folder and all .iml files
# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360

.idea/

# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023

*.iml
modules.xml
.idea/misc.xml
*.ipr

# Sonarlint plugin
.idea/sonarlint

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/

### Linux ###
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook

# IPython

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# profiling data
.prof

### vscode ###
.vscode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/jetbrains+all,vscode,python,jupyternotebooks,linux,windows,macos
MANIFEST.in
ADDED
@@ -0,0 +1 @@
include requirements.txt
README.md
CHANGED
@@ -1,13 +1 @@
----
-title: Relik
-emoji: 🐨
-colorFrom: gray
-colorTo: pink
-sdk: streamlit
-sdk_version: 1.27.2
-app_file: app.py
-pinned: false
-license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# relik
SETUP.cfg
ADDED
@@ -0,0 +1,8 @@
[metadata]
description-file = README.md

[build]
build-base = /tmp/build

[egg_info]
egg-base = /tmp
app.py
ADDED
@@ -0,0 +1,245 @@
import os
import re
import time
from pathlib import Path

import requests
import streamlit as st
from spacy import displacy
from streamlit_extras.badges import badge
from streamlit_extras.stylable_container import stylable_container

# RELIK = os.getenv("RELIK", "localhost:8000/api/entities")

import random

from relik.inference.annotator import Relik


def get_random_color(ents):
    colors = {}
    random_colors = generate_pastel_colors(len(ents))
    for ent in ents:
        colors[ent] = random_colors.pop(random.randint(0, len(random_colors) - 1))
    return colors


def floatrange(start, stop, steps):
    if int(steps) == 1:
        return [stop]
    return [
        start + float(i) * (stop - start) / (float(steps) - 1) for i in range(steps)
    ]


def hsl_to_rgb(h, s, l):
    def hue_2_rgb(v1, v2, v_h):
        while v_h < 0.0:
            v_h += 1.0
        while v_h > 1.0:
            v_h -= 1.0
        if 6 * v_h < 1.0:
            return v1 + (v2 - v1) * 6.0 * v_h
        if 2 * v_h < 1.0:
            return v2
        if 3 * v_h < 2.0:
            return v1 + (v2 - v1) * ((2.0 / 3.0) - v_h) * 6.0
        return v1

    # if not (0 <= s <= 1): raise ValueError, "s (saturation) parameter must be between 0 and 1."
    # if not (0 <= l <= 1): raise ValueError, "l (lightness) parameter must be between 0 and 1."

    r, b, g = (l * 255,) * 3
    if s != 0.0:
        if l < 0.5:
            var_2 = l * (1.0 + s)
        else:
            var_2 = (l + s) - (s * l)
        var_1 = 2.0 * l - var_2
        r = 255 * hue_2_rgb(var_1, var_2, h + (1.0 / 3.0))
        g = 255 * hue_2_rgb(var_1, var_2, h)
        b = 255 * hue_2_rgb(var_1, var_2, h - (1.0 / 3.0))

    return int(round(r)), int(round(g)), int(round(b))


def generate_pastel_colors(n):
    """Return different pastel colours.

    Input:
        n (integer) : The number of colors to return

    Output:
        A list of colors in HTML notation (eg.['#cce0ff', '#ffcccc', '#ccffe0', '#f5ccff', '#f5ffcc'])

    Example:
        >>> print generate_pastel_colors(5)
        ['#cce0ff', '#f5ccff', '#ffcccc', '#f5ffcc', '#ccffe0']
    """
    if n == 0:
        return []

    # To generate colors, we use the HSL colorspace (see http://en.wikipedia.org/wiki/HSL_color_space)
    start_hue = 0.6  # 0=red 1/3=0.333=green 2/3=0.666=blue
    saturation = 1.0
    lightness = 0.8
    # We take points around the chromatic circle (hue):
    # (Note: we generate n+1 colors, then drop the last one ([:-1]) because
    # it equals the first one (hue 0 = hue 1))
    return [
        "#%02x%02x%02x" % hsl_to_rgb(hue, saturation, lightness)
        for hue in floatrange(start_hue, start_hue + 1, n + 1)
    ][:-1]


def set_sidebar(css):
    white_link_wrapper = "<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css'><a href='{}'>{}</a>"
    with st.sidebar:
        st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
        st.image(
            "http://nlp.uniroma1.it/static/website/sapienza-nlp-logo-wh.svg",
            use_column_width=True,
        )
        st.markdown("## ReLiK")
        st.write(
            f"""
            - {white_link_wrapper.format("#", "<i class='fa-solid fa-file'></i> Paper")}
            - {white_link_wrapper.format("https://github.com/SapienzaNLP/relik", "<i class='fa-brands fa-github'></i> GitHub")}
            - {white_link_wrapper.format("https://hub.docker.com/repository/docker/sapienzanlp/relik", "<i class='fa-brands fa-docker'></i> Docker Hub")}
            """,
            unsafe_allow_html=True,
        )
        st.markdown("## Sapienza NLP")
        st.write(
            f"""
            - {white_link_wrapper.format("https://nlp.uniroma1.it", "<i class='fa-solid fa-globe'></i> Webpage")}
            - {white_link_wrapper.format("https://github.com/SapienzaNLP", "<i class='fa-brands fa-github'></i> GitHub")}
            - {white_link_wrapper.format("https://twitter.com/SapienzaNLP", "<i class='fa-brands fa-twitter'></i> Twitter")}
            - {white_link_wrapper.format("https://www.linkedin.com/company/79434450", "<i class='fa-brands fa-linkedin'></i> LinkedIn")}
            """,
            unsafe_allow_html=True,
        )


def get_el_annotations(response):
    # swap labels key with ents
    dict_of_ents = {"text": response.text, "ents": []}
    dict_of_ents["ents"] = response.labels
    label_in_text = set(l["label"] for l in dict_of_ents["ents"])
    options = {"ents": label_in_text, "colors": get_random_color(label_in_text)}
    return dict_of_ents, options


def set_intro(css):
    # intro
    st.markdown("# ReLik")
    st.markdown(
        "### Retrieve, Read and LinK: Fast and Accurate Entity Linking and Relation Extraction on an Academic Budget"
    )
    # st.markdown(
    #     "This is a front-end for the paper [Universal Semantic Annotator: the First Unified API "
    #     "for WSD, SRL and Semantic Parsing](https://www.researchgate.net/publication/360671045_Universal_Semantic_Annotator_the_First_Unified_API_for_WSD_SRL_and_Semantic_Parsing), which will be presented at LREC 2022 by "
    #     "[Riccardo Orlando](https://riccorl.github.io), [Simone Conia](https://c-simone.github.io/), "
    #     "[Stefano Faralli](https://corsidilaurea.uniroma1.it/it/users/stefanofaralliuniroma1it), and [Roberto Navigli](https://www.diag.uniroma1.it/navigli/)."
    # )
    badge(type="github", name="sapienzanlp/relik")
    badge(type="pypi", name="relik")


def run_client():
    with open(Path(__file__).parent / "style.css") as f:
        css = f.read()

    st.set_page_config(
        page_title="ReLik",
        page_icon="🦮",
        layout="wide",
    )
    set_sidebar(css)
    set_intro(css)

    # text input
    text = st.text_area(
        "Enter Text Below:",
        value="Obama went to Rome for a quick vacation.",
        height=200,
        max_chars=500,
    )

    with stylable_container(
        key="annotate_button",
        css_styles="""
            button {
                background-color: #802433;
                color: white;
                border-radius: 25px;
            }
            """,
    ):
        submit = st.button("Annotate")
    # submit = st.button("Run")

    relik = Relik(
        question_encoder="riccorl/relik-retriever-aida-blink-pretrain-omniencoder",
        document_index="riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder",
        reader="riccorl/relik-reader-aida-deberta-small",
        top_k=100,
        window_size=32,
        window_stride=16,
        candidates_preprocessing_fn="relik.inference.preprocessing.wikipedia_title_and_openings_preprocessing",
    )

    # ReLik API call
    if submit:
        text = text.strip()
        if text:
            st.markdown("####")
            st.markdown("#### Entity Linking")
            with st.spinner(text="In progress"):
                response = relik(text)
                # response = requests.post(RELIK, json=text)
                # if response.status_code != 200:
                #     st.error("Error: {}".format(response.status_code))
                # else:
                #     response = response.json()

            # Entity Linking
            # with stylable_container(
            #     key="container_with_border",
            #     css_styles="""
            #     {
            #         border: 1px solid rgba(49, 51, 63, 0.2);
            #         border-radius: 0.5rem;
            #         padding: 0.5rem;
            #         padding-bottom: 2rem;
            #     }
            #     """,
            # ):
            # st.markdown("##")
            dict_of_ents, options = get_el_annotations(response=response)
            display = displacy.render(
                dict_of_ents, manual=True, style="ent", options=options
            )
            display = display.replace("\n", " ")
            # wsd_display = re.sub(
            #     r"(wiki::\d+\w)",
            #     r"<a href='https://babelnet.org/synset?id=\g<1>&orig=\g<1>&lang={}'>\g<1></a>".format(
            #         language.upper()
            #     ),
            #     wsd_display,
            # )
            with st.container():
                st.write(display, unsafe_allow_html=True)

            st.markdown("####")
            st.markdown("#### Relation Extraction")

            with st.container():
                st.write("Coming :)", unsafe_allow_html=True)

        else:
            st.error("Please enter some text.")


if __name__ == "__main__":
    run_client()
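For a quick check outside the UI, the same annotator that run_client builds can be called directly; this is a minimal sketch using the checkpoints hard-coded above (downloading them from the Hugging Face Hub is assumed to work in your environment):

# Sketch: the Relik annotator from app.py, without the Streamlit front-end.
from relik.inference.annotator import Relik

relik = Relik(
    question_encoder="riccorl/relik-retriever-aida-blink-pretrain-omniencoder",
    document_index="riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder",
    reader="riccorl/relik-reader-aida-deberta-small",
    top_k=100,
    window_size=32,
    window_stride=16,
    candidates_preprocessing_fn="relik.inference.preprocessing.wikipedia_title_and_openings_preprocessing",
)
response = relik("Obama went to Rome for a quick vacation.")
print(response.text)    # get_el_annotations above reads .text and .labels
print(response.labels)  # off the returned object

The front-end itself is started through Streamlit's own CLI (streamlit run app.py), which is also how the Space runs it, given the app_file: app.py entry in the replaced README front matter.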
dockerfiles/Dockerfile.cpu
ADDED
@@ -0,0 +1,17 @@
FROM tiangolo/uvicorn-gunicorn:python3.10-slim

# Copy and install requirements.txt
COPY ./requirements.txt ./requirements.txt
COPY ./src /app
COPY ./scripts/start.sh /start.sh
COPY ./scripts/prestart.sh /app
COPY ./scripts/gunicorn_conf.py /gunicorn_conf.py
COPY ./scripts/start-reload.sh /start-reload.sh
COPY ./VERSION /
RUN mkdir -p /app/resources/model \
    && pip install --no-cache-dir -r requirements.txt \
    && chmod +x /start.sh && chmod +x /start-reload.sh
ARG MODEL_PATH
COPY ${MODEL_PATH}/* /app/resources/model/

ENV APP_MODULE=main:app
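Since this image bakes the model in at build time through the MODEL_PATH build argument, building it presumably looks something like docker build -f dockerfiles/Dockerfile.cpu --build-arg MODEL_PATH=./model -t relik-cpu . run from the repository root, with MODEL_PATH pointing at a local directory of model files (the path and tag here are placeholders).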
dockerfiles/Dockerfile.cuda
ADDED
@@ -0,0 +1,38 @@
FROM nvidia/cuda:12.2.0-base-ubuntu20.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update \
    && apt-get install \
    curl wget python3.10 \
    python3.10-distutils \
    python3-pip \
    curl wget -y \
    && rm -rf /var/lib/apt/lists/*

# FastAPI section
# device env
ENV DEVICE="cuda"
# Copy and install requirements.txt
COPY ./gpu-requirements.txt ./requirements.txt
COPY ./src /app
COPY ./scripts/start.sh /start.sh
COPY ./scripts/gunicorn_conf.py /gunicorn_conf.py
COPY ./scripts/start-reload.sh /start-reload.sh
COPY ./scripts/prestart.sh /app
COPY ./VERSION /
RUN mkdir -p /app/resources/model \
    && pip install --upgrade --no-cache-dir -r requirements.txt \
    && chmod +x /start.sh \
    && chmod +x /start-reload.sh
ARG MODEL_NAME_OR_PATH

WORKDIR /app

ENV PYTHONPATH=/app

EXPOSE 80

# Run the start script, it will check for an /app/prestart.sh script (e.g. for migrations)
# And then will start Gunicorn with Uvicorn
CMD ["/start.sh"]
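Unlike the CPU image, this one declares MODEL_NAME_OR_PATH but, as written here, never copies it in, so the model is presumably resolved at run time. A hedged sketch of building and running it (image tag and host port are placeholders): docker build -f dockerfiles/Dockerfile.cuda -t relik-gpu . followed by docker run --gpus all -p 8000:80 relik-gpu, matching the EXPOSE 80 above.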
examples/train_retriever.py
ADDED
@@ -0,0 +1,45 @@
from relik.retriever.trainer import RetrieverTrainer
from relik import GoldenRetriever
from relik.retriever.indexers.inmemory import InMemoryDocumentIndex
from relik.retriever.data.datasets import AidaInBatchNegativesDataset

if __name__ == "__main__":
    # instantiate retriever
    document_index = InMemoryDocumentIndex(
        documents="/root/golden-retriever-v2/data/dpr-like/el/definitions.txt",
        device="cuda",
        precision="16",
    )
    retriever = GoldenRetriever(
        question_encoder="intfloat/e5-small-v2", document_index=document_index
    )

    train_dataset = AidaInBatchNegativesDataset(
        name="aida_train",
        path="/root/golden-retriever-v2/data/dpr-like/el/aida_32_tokens_topic/train.jsonl",
        tokenizer=retriever.question_tokenizer,
        question_batch_size=64,
        passage_batch_size=400,
        max_passage_length=64,
        use_topics=True,
        shuffle=True,
    )
    val_dataset = AidaInBatchNegativesDataset(
        name="aida_val",
        path="/root/golden-retriever-v2/data/dpr-like/el/aida_32_tokens_topic/val.jsonl",
        tokenizer=retriever.question_tokenizer,
        question_batch_size=64,
        passage_batch_size=400,
        max_passage_length=64,
        use_topics=True,
    )

    trainer = RetrieverTrainer(
        retriever=retriever,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        max_steps=25_000,
        wandb_offline_mode=True,
    )

    trainer.train()
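The /root/golden-retriever-v2 paths above are machine-specific; after pointing documents and the two dataset path arguments at local copies of the DPR-style AIDA files, the example is meant to be launched directly, e.g. python examples/train_retriever.py.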
pyproject.toml
ADDED
@@ -0,0 +1,15 @@
[tool.black]
include = '\.pyi?$'
exclude = '''
/(
    \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | _build
  | buck-out
  | build
  | dist
)/
'''
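The only tool configured here is black; with this file at the repository root, a plain black . invocation picks up the include and exclude patterns above automatically.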
relik/__init__.py
ADDED
@@ -0,0 +1 @@
from relik.retriever.pytorch_modules.model import GoldenRetriever
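This re-export makes GoldenRetriever importable from the package root; a quick sanity check of the aliasing:

# Both import paths resolve to the same class object.
from relik import GoldenRetriever
from relik.retriever.pytorch_modules.model import GoldenRetriever as Original

assert GoldenRetriever is Original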
relik/common/__init__.py
ADDED
File without changes
relik/common/log.py
ADDED
@@ -0,0 +1,97 @@
import logging
import sys
import threading
from typing import Optional

from rich import get_console

_lock = threading.Lock()
_default_handler: Optional[logging.Handler] = None

_default_log_level = logging.WARNING

# fancy logger
_console = get_console()


def _get_library_name() -> str:
    return __name__.split(".")[0]


def _get_library_root_logger() -> logging.Logger:
    return logging.getLogger(_get_library_name())


def _configure_library_root_logger() -> None:
    global _default_handler

    with _lock:
        if _default_handler:
            # This library has already configured the library root logger.
            return
        _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
        _default_handler.flush = sys.stderr.flush

        # Apply our default configuration to the library root logger.
        library_root_logger = _get_library_root_logger()
        library_root_logger.addHandler(_default_handler)
        library_root_logger.setLevel(_default_log_level)
        library_root_logger.propagate = False


def _reset_library_root_logger() -> None:
    global _default_handler

    with _lock:
        if not _default_handler:
            return

        library_root_logger = _get_library_root_logger()
        library_root_logger.removeHandler(_default_handler)
        library_root_logger.setLevel(logging.NOTSET)
        _default_handler = None


def set_log_level(level: int, logger: logging.Logger = None) -> None:
    """
    Set the log level.
    Args:
        level (:obj:`int`):
            Logging level.
        logger (:obj:`logging.Logger`):
            Logger to set the log level.
    """
    if not logger:
        _configure_library_root_logger()
        logger = _get_library_root_logger()
    logger.setLevel(level)


def get_logger(
    name: Optional[str] = None,
    level: Optional[int] = None,
    formatter: Optional[logging.Formatter] = None,
) -> logging.Logger:
    """
    Return a logger with the specified name.
    """

    if name is None:
        name = _get_library_name()

    _configure_library_root_logger()

    if level is not None:
        set_log_level(level)

    if formatter is None:
        formatter = logging.Formatter(
            "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
        )
    _default_handler.setFormatter(formatter)

    return logging.getLogger(name)


def get_console_logger():
    return _console
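A minimal usage sketch of the helpers above: calling get_logger without a name returns the library root logger (lazily installing the shared stderr handler), and set_log_level adjusts its threshold afterwards.

import logging

from relik.common.log import get_logger, set_log_level

logger = get_logger(level=logging.DEBUG)  # no name: the library root logger
logger.debug("relik logging configured")  # emitted through the shared stderr handler
set_log_level(logging.WARNING)  # raise the threshold back up afterwards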
relik/common/upload.py
ADDED
@@ -0,0 +1,128 @@
import argparse
import json
import logging
import os
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Optional, Union

import huggingface_hub

from relik.common.log import get_logger
from relik.common.utils import SAPIENZANLP_DATE_FORMAT, get_md5

logger = get_logger(level=logging.DEBUG)


def create_info_file(tmpdir: Path):
    logger.debug("Computing md5 of model.zip")
    md5 = get_md5(tmpdir / "model.zip")
    date = datetime.now().strftime(SAPIENZANLP_DATE_FORMAT)

    logger.debug("Dumping info.json file")
    with (tmpdir / "info.json").open("w") as f:
        json.dump(dict(md5=md5, upload_date=date), f, indent=2)


def zip_run(
    dir_path: Union[str, os.PathLike],
    tmpdir: Union[str, os.PathLike],
    zip_name: str = "model.zip",
) -> Path:
    logger.debug(f"zipping {dir_path} to {tmpdir}")
    # creates a zip version of the provided dir_path
    run_dir = Path(dir_path)
    zip_path = tmpdir / zip_name

    with zipfile.ZipFile(zip_path, "w") as zip_file:
        # fully zip the run directory maintaining its structure
        for file in run_dir.rglob("*.*"):
            if file.is_dir():
                continue

            zip_file.write(file, arcname=file.relative_to(run_dir))

    return zip_path


def upload(
    model_dir: Union[str, os.PathLike],
    model_name: str,
    organization: Optional[str] = None,
    repo_name: Optional[str] = None,
    commit: Optional[str] = None,
    archive: bool = False,
):
    token = huggingface_hub.HfFolder.get_token()
    if token is None:
        print(
            "No HuggingFace token found. You need to execute `huggingface-cli login` first!"
        )
        return

    repo_id = repo_name or model_name
    if organization is not None:
        repo_id = f"{organization}/{repo_id}"
    with tempfile.TemporaryDirectory() as tmpdir:
        api = huggingface_hub.HfApi()
        repo_url = api.create_repo(
            token=token,
            repo_id=repo_id,
            exist_ok=True,
        )
        repo = huggingface_hub.Repository(
            str(tmpdir), clone_from=repo_url, use_auth_token=token
        )

        tmp_path = Path(tmpdir)
        if archive:
            # otherwise we zip the model_dir
            logger.debug(f"Zipping {model_dir} to {tmp_path}")
            zip_run(model_dir, tmp_path)
            create_info_file(tmp_path)
        else:
            # if the user wants to upload a transformers model, we don't need to zip it
            # we just need to copy the files to the tmpdir
            logger.debug(f"Copying {model_dir} to {tmpdir}")
            os.system(f"cp -r {model_dir}/* {tmpdir}")

        # this method automatically puts large files (>10MB) into git lfs
        repo.push_to_hub(commit_message=commit or "Automatic push from sapienzanlp")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model_dir", help="The directory of the model you want to upload"
    )
    parser.add_argument("model_name", help="The model you want to upload")
    parser.add_argument(
        "--organization",
        help="the name of the organization where you want to upload the model",
    )
    parser.add_argument(
        "--repo_name",
        help="Optional name to use when uploading to the HuggingFace repository",
    )
    parser.add_argument(
        "--commit", help="Commit message to use when pushing to the HuggingFace Hub"
    )
    parser.add_argument(
        "--archive",
        action="store_true",
        help="""
            Whether to compress the model directory before uploading it.
            If True, the model directory will be zipped and the zip file will be uploaded.
            If False, the model directory will be uploaded as is.""",
    )
    return parser.parse_args()


def main():
    upload(**vars(parse_args()))


if __name__ == "__main__":
    main()
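Thanks to the __main__ guard, the module doubles as a CLI (python -m relik.common.upload, with the arguments defined in parse_args above); programmatically it is a single call. A sketch with placeholder paths and names:

from relik.common.upload import upload

upload(
    model_dir="experiments/my-model",  # placeholder: directory holding the model files
    model_name="my-model",             # placeholder: repository name on the Hub
    organization="sapienzanlp",        # optional namespace prepended to the repo id
    commit="first model upload",       # optional commit message
    archive=True,                      # zip the directory and add an md5 info.json first
)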
relik/common/utils.py
ADDED
@@ -0,0 +1,609 @@
import importlib.util
import json
import logging
import os
import shutil
import tarfile
import tempfile
from functools import partial
from hashlib import sha256
from pathlib import Path
from typing import Any, BinaryIO, Dict, List, Optional, Union
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile

import huggingface_hub
import requests
import tqdm
from filelock import FileLock
from transformers.utils.hub import cached_file as hf_cached_file

from relik.common.log import get_logger

# name constants
WEIGHTS_NAME = "weights.pt"
ONNX_WEIGHTS_NAME = "weights.onnx"
CONFIG_NAME = "config.yaml"
LABELS_NAME = "labels.json"

# SAPIENZANLP_USER_NAME = "sapienzanlp"
SAPIENZANLP_USER_NAME = "riccorl"
SAPIENZANLP_HF_MODEL_REPO_URL = "riccorl/{model_id}"
SAPIENZANLP_HF_MODEL_REPO_ARCHIVE_URL = (
    f"{SAPIENZANLP_HF_MODEL_REPO_URL}/resolve/main/model.zip"
)
# path constants
SAPIENZANLP_CACHE_DIR = os.getenv("SAPIENZANLP_CACHE_DIR", Path.home() / ".sapienzanlp")
SAPIENZANLP_DATE_FORMAT = "%Y-%m-%d %H-%M-%S"


logger = get_logger(__name__)


def sapienzanlp_model_urls(model_id: str) -> str:
    """
    Returns the URL for a possible SapienzaNLP valid model.

    Args:
        model_id (:obj:`str`):
            A SapienzaNLP model id.

    Returns:
        :obj:`str`: The url for the model id.
    """
    # check if there is already the namespace of the user
    if "/" in model_id:
        return model_id
    return SAPIENZANLP_HF_MODEL_REPO_URL.format(model_id=model_id)


def is_package_available(package_name: str) -> bool:
    """
    Check if a package is available.

    Args:
        package_name (`str`): The name of the package to check.
    """
    return importlib.util.find_spec(package_name) is not None


def load_json(path: Union[str, Path]) -> Any:
    """
    Load a json file provided in input.

    Args:
        path (`Union[str, Path]`): The path to the json file to load.

    Returns:
        `Any`: The loaded json file.
    """
    with open(path, encoding="utf8") as f:
        return json.load(f)


def dump_json(document: Any, path: Union[str, Path], indent: Optional[int] = None):
    """
    Dump input to json file.

    Args:
        document (`Any`): The document to dump.
        path (`Union[str, Path]`): The path to dump the document to.
        indent (`Optional[int]`): The indent to use for the json file.

    """
    with open(path, "w", encoding="utf8") as outfile:
        json.dump(document, outfile, indent=indent)


def get_md5(path: Path):
    """
    Get the MD5 value of a path.
    """
    import hashlib

    with path.open("rb") as fin:
        data = fin.read()
    return hashlib.md5(data).hexdigest()


def file_exists(path: Union[str, os.PathLike]) -> bool:
    """
    Check if the file at :obj:`path` exists.

    Args:
        path (:obj:`str`, :obj:`os.PathLike`):
            Path to check.

    Returns:
        :obj:`bool`: :obj:`True` if the file exists.
    """
    return Path(path).exists()


def dir_exists(path: Union[str, os.PathLike]) -> bool:
    """
    Check if the directory at :obj:`path` exists.

    Args:
        path (:obj:`str`, :obj:`os.PathLike`):
            Path to check.

    Returns:
        :obj:`bool`: :obj:`True` if the directory exists.
    """
    return Path(path).is_dir()


def is_remote_url(url_or_filename: Union[str, Path]):
    """
    Returns :obj:`True` if the input path is an url.

    Args:
        url_or_filename (:obj:`str`, :obj:`Path`):
            path to check.

    Returns:
        :obj:`bool`: :obj:`True` if the input path is an url, :obj:`False` otherwise.

    """
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    parsed = urlparse(url_or_filename)
    return parsed.scheme in ("http", "https")


def url_to_filename(resource: str, etag: str = None) -> str:
    """
    Convert a `resource` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the resources's, delimited
    by a period.
    """
    resource_bytes = resource.encode("utf-8")
    resource_hash = sha256(resource_bytes)
    filename = resource_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode("utf-8")
        etag_hash = sha256(etag_bytes)
        filename += "." + etag_hash.hexdigest()

    return filename


def download_resource(
    url: str,
    temp_file: BinaryIO,
    headers=None,
):
    """
    Download remote file.
    """

    if headers is None:
        headers = {}

    r = requests.get(url, stream=True, headers=headers)
    r.raise_for_status()
    content_length = r.headers.get("Content-Length")
    total = int(content_length) if content_length is not None else None
    # `tqdm` is imported as a module above, so the progress-bar class is `tqdm.tqdm`
    progress = tqdm.tqdm(
        unit="B",
        unit_scale=True,
        total=total,
        desc="Downloading",
        disable=logger.level in [logging.NOTSET],
    )
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def download_and_cache(
    url: Union[str, Path],
    cache_dir: Union[str, Path] = None,
    force_download: bool = False,
):
    if cache_dir is None:
        cache_dir = SAPIENZANLP_CACHE_DIR
    if isinstance(url, Path):
        url = str(url)

    # check if cache dir exists
    Path(cache_dir).mkdir(parents=True, exist_ok=True)

    # check if file is private
    headers = {}
    try:
        r = requests.head(url, allow_redirects=False, timeout=10)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        if r.status_code == 401:
            hf_token = huggingface_hub.HfFolder.get_token()
            if hf_token is None:
                raise ValueError(
                    "You need to login to HuggingFace to download this model "
                    "(use the `huggingface-cli login` command)"
                )
            headers["Authorization"] = f"Bearer {hf_token}"

    etag = None
    try:
        r = requests.head(url, allow_redirects=True, timeout=10, headers=headers)
        r.raise_for_status()
        etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
        # We favor a custom header indicating the etag of the linked resource, and
        # we fallback to the regular etag header.
        # If we don't have any of those, raise an error.
        if etag is None:
            raise OSError(
                "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
            )
        # In case of a redirect,
        # save an extra redirect on the request.get call,
        # and ensure we download the exact atomic version even if it changed
        # between the HEAD and the GET (unlikely, but hey).
        if 300 <= r.status_code <= 399:
            url = r.headers["Location"]
    except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
        # Actually raise for those subclasses of ConnectionError
        raise
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Otherwise, our Internet connection is down.
        # etag is None
        pass

    # get filename from the url
    filename = url_to_filename(url, etag)
    # get cache path to put the file
    cache_path = Path(cache_dir) / filename  # cache_dir may be a str (e.g. from the env var)

    # the file is already here, return it
    if file_exists(cache_path) and not force_download:
        logger.info(
            f"{url} found in cache, set `force_download=True` to force the download"
        )
        return cache_path

    cache_path = str(cache_path)
    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):
        # If the download just completed while the lock was activated.
        if file_exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        temp_file_manager = partial(
            tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
        )

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise, you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info(
                f"{url} not found in cache or `force_download` set to `True`, downloading to {temp_file.name}"
            )
            download_resource(url, temp_file, headers)

        logger.info(f"storing {url} in cache at {cache_path}")
        os.replace(temp_file.name, cache_path)

        # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it.
        umask = os.umask(0o666)
        os.umask(umask)
        os.chmod(cache_path, 0o666 & ~umask)

        logger.info(f"creating metadata file for {cache_path}")
        meta = {"url": url}  # , "etag": etag}
        meta_path = cache_path + ".json"
        with open(meta_path, "w") as meta_file:
            json.dump(meta, meta_file)

    return cache_path


def download_from_hf(
    path_or_repo_id: Union[str, Path],
    filenames: Optional[List[str]],
    cache_dir: Union[str, Path] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
):
    if isinstance(path_or_repo_id, Path):
        path_or_repo_id = str(path_or_repo_id)

    downloaded_paths = []
    for filename in filenames:
        downloaded_path = hf_cached_file(
            path_or_repo_id,
            filename,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            use_auth_token=use_auth_token,
            revision=revision,
            local_files_only=local_files_only,
            subfolder=subfolder,
        )
        downloaded_paths.append(downloaded_path)

    # we want the folder where the files are downloaded
    # the best guess is the parent folder of the first file
    probably_the_folder = Path(downloaded_paths[0]).parent
    return probably_the_folder


def model_name_or_path_resolver(model_name_or_dir: Union[str, os.PathLike]) -> str:
    """
    Resolve a model name or directory to a model archive name or directory.

    Args:
        model_name_or_dir (:obj:`str` or :obj:`os.PathLike`):
            A model name or directory.

    Returns:
        :obj:`str`: The model archive name or directory.
    """
    if is_remote_url(model_name_or_dir):
        # if model_name_or_dir is a URL
        # download it and try to load
        model_archive = model_name_or_dir
    elif Path(model_name_or_dir).is_dir() or Path(model_name_or_dir).is_file():
        # if model_name_or_dir is a local directory or
        # an archive file try to load it
        model_archive = model_name_or_dir
    else:
        # probably model_name_or_dir is a sapienzanlp model id
        # guess the url and try to download
        model_name_or_dir_ = model_name_or_dir
        # raise ValueError(f"Providing a model id is not supported yet.")
        model_archive = sapienzanlp_model_urls(model_name_or_dir_)

    return model_archive


def from_cache(
    url_or_filename: Union[str, Path],
    cache_dir: Union[str, Path] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    filenames: Optional[List[str]] = None,
) -> Path:
    """
    Given something that could be either a local path or a URL (or a SapienzaNLP model id),
    determine which one and return a path to the corresponding file.

    Args:
        url_or_filename (:obj:`str` or :obj:`Path`):
            A path to a local file or a URL (or a SapienzaNLP model id).
        cache_dir (:obj:`str` or :obj:`Path`, `optional`):
            Path to a directory in which a downloaded file will be cached.
        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to re-download the file even if it already exists.
        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to delete incompletely received files. Attempts to resume the download if such a file
            exists.
        proxies (:obj:`Dict[str, str]`, `optional`):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (:obj:`Union[bool, str]`, `optional`):
            Optional string or boolean to use as Bearer token for remote files. If :obj:`True`, will get token from
            :obj:`~transformers.hf_api.HfApi`. If :obj:`str`, will use that string as token.
        revision (:obj:`str`, `optional`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to raise an error if the file to be downloaded is local.
        subfolder (:obj:`str`, `optional`):
            In case the relevant file is in a subfolder of the URL, specify it here.
|
| 413 |
+
filenames (:obj:`List[str]`, `optional`):
|
| 414 |
+
List of filenames to look for in the directory structure.
|
| 415 |
+
|
| 416 |
+
Returns:
|
| 417 |
+
:obj:`Path`: Path to the cached file.
|
| 418 |
+
"""
|
| 419 |
+
|
| 420 |
+
url_or_filename = model_name_or_path_resolver(url_or_filename)
|
| 421 |
+
|
| 422 |
+
if cache_dir is None:
|
| 423 |
+
cache_dir = SAPIENZANLP_CACHE_DIR
|
| 424 |
+
|
| 425 |
+
if file_exists(url_or_filename):
|
| 426 |
+
logger.info(f"{url_or_filename} is a local path or file")
|
| 427 |
+
output_path = url_or_filename
|
| 428 |
+
elif is_remote_url(url_or_filename):
|
| 429 |
+
# URL, so get it from the cache (downloading if necessary)
|
| 430 |
+
output_path = download_and_cache(
|
| 431 |
+
url_or_filename,
|
| 432 |
+
cache_dir=cache_dir,
|
| 433 |
+
force_download=force_download,
|
| 434 |
+
)
|
| 435 |
+
else:
|
| 436 |
+
if filenames is None:
|
| 437 |
+
filenames = [WEIGHTS_NAME, CONFIG_NAME, LABELS_NAME]
|
| 438 |
+
output_path = download_from_hf(
|
| 439 |
+
url_or_filename,
|
| 440 |
+
filenames,
|
| 441 |
+
cache_dir,
|
| 442 |
+
force_download,
|
| 443 |
+
resume_download,
|
| 444 |
+
proxies,
|
| 445 |
+
use_auth_token,
|
| 446 |
+
revision,
|
| 447 |
+
local_files_only,
|
| 448 |
+
subfolder,
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
# if is_hf_hub_url(url_or_filename):
|
| 452 |
+
# HuggingFace Hub
|
| 453 |
+
# output_path = hf_hub_download_url(url_or_filename)
|
| 454 |
+
# elif is_remote_url(url_or_filename):
|
| 455 |
+
# # URL, so get it from the cache (downloading if necessary)
|
| 456 |
+
# output_path = download_and_cache(
|
| 457 |
+
# url_or_filename,
|
| 458 |
+
# cache_dir=cache_dir,
|
| 459 |
+
# force_download=force_download,
|
| 460 |
+
# )
|
| 461 |
+
# elif file_exists(url_or_filename):
|
| 462 |
+
# logger.info(f"{url_or_filename} is a local path or file")
|
| 463 |
+
# # File, and it exists.
|
| 464 |
+
# output_path = url_or_filename
|
| 465 |
+
# elif urlparse(url_or_filename).scheme == "":
|
| 466 |
+
# # File, but it doesn't exist.
|
| 467 |
+
# raise EnvironmentError(f"file {url_or_filename} not found")
|
| 468 |
+
# else:
|
| 469 |
+
# # Something unknown
|
| 470 |
+
# raise ValueError(
|
| 471 |
+
# f"unable to parse {url_or_filename} as a URL or as a local path"
|
| 472 |
+
# )
|
| 473 |
+
|
| 474 |
+
if dir_exists(output_path) or (
|
| 475 |
+
not is_zipfile(output_path) and not tarfile.is_tarfile(output_path)
|
| 476 |
+
):
|
| 477 |
+
return Path(output_path)
|
| 478 |
+
|
| 479 |
+
# Path where we extract compressed archives
|
| 480 |
+
# for now it will extract it in the same folder
|
| 481 |
+
# maybe implement extraction in the sapienzanlp folder
|
| 482 |
+
# when using local archive path?
|
| 483 |
+
logger.info("Extracting compressed archive")
|
| 484 |
+
output_dir, output_file = os.path.split(output_path)
|
| 485 |
+
output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
|
| 486 |
+
output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
|
| 487 |
+
|
| 488 |
+
# already extracted, do not extract
|
| 489 |
+
if (
|
| 490 |
+
os.path.isdir(output_path_extracted)
|
| 491 |
+
and os.listdir(output_path_extracted)
|
| 492 |
+
and not force_download
|
| 493 |
+
):
|
| 494 |
+
return Path(output_path_extracted)
|
| 495 |
+
|
| 496 |
+
# Prevent parallel extractions
|
| 497 |
+
lock_path = output_path + ".lock"
|
| 498 |
+
with FileLock(lock_path):
|
| 499 |
+
shutil.rmtree(output_path_extracted, ignore_errors=True)
|
| 500 |
+
os.makedirs(output_path_extracted)
|
| 501 |
+
if is_zipfile(output_path):
|
| 502 |
+
with ZipFile(output_path, "r") as zip_file:
|
| 503 |
+
zip_file.extractall(output_path_extracted)
|
| 504 |
+
zip_file.close()
|
| 505 |
+
elif tarfile.is_tarfile(output_path):
|
| 506 |
+
tar_file = tarfile.open(output_path)
|
| 507 |
+
tar_file.extractall(output_path_extracted)
|
| 508 |
+
tar_file.close()
|
| 509 |
+
else:
|
| 510 |
+
raise EnvironmentError(
|
| 511 |
+
f"Archive format of {output_path} could not be identified"
|
| 512 |
+
)
|
| 513 |
+
|
| 514 |
+
# remove lock file, is it safe?
|
| 515 |
+
os.remove(lock_path)
|
| 516 |
+
|
| 517 |
+
return Path(output_path_extracted)
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
def is_str_a_path(maybe_path: str) -> bool:
|
| 521 |
+
"""
|
| 522 |
+
Check if a string is a path.
|
| 523 |
+
|
| 524 |
+
Args:
|
| 525 |
+
maybe_path (`str`): The string to check.
|
| 526 |
+
|
| 527 |
+
Returns:
|
| 528 |
+
`bool`: `True` if the string is a path, `False` otherwise.
|
| 529 |
+
"""
|
| 530 |
+
# first check if it is a path
|
| 531 |
+
if Path(maybe_path).exists():
|
| 532 |
+
return True
|
| 533 |
+
# check if it is a relative path
|
| 534 |
+
if Path(os.path.join(os.getcwd(), maybe_path)).exists():
|
| 535 |
+
return True
|
| 536 |
+
# otherwise it is not a path
|
| 537 |
+
return False
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
def relative_to_absolute_path(path: str) -> os.PathLike:
|
| 541 |
+
"""
|
| 542 |
+
Convert a relative path to an absolute path.
|
| 543 |
+
|
| 544 |
+
Args:
|
| 545 |
+
path (`str`): The relative path to convert.
|
| 546 |
+
|
| 547 |
+
Returns:
|
| 548 |
+
`os.PathLike`: The absolute path.
|
| 549 |
+
"""
|
| 550 |
+
if not is_str_a_path(path):
|
| 551 |
+
raise ValueError(f"{path} is not a path")
|
| 552 |
+
if Path(path).exists():
|
| 553 |
+
return Path(path).absolute()
|
| 554 |
+
if Path(os.path.join(os.getcwd(), path)).exists():
|
| 555 |
+
return Path(os.path.join(os.getcwd(), path)).absolute()
|
| 556 |
+
raise ValueError(f"{path} is not a path")
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def to_config(object_to_save: Any) -> Dict[str, Any]:
|
| 560 |
+
"""
|
| 561 |
+
Convert an object to a dictionary.
|
| 562 |
+
|
| 563 |
+
Returns:
|
| 564 |
+
`Dict[str, Any]`: The dictionary representation of the object.
|
| 565 |
+
"""
|
| 566 |
+
|
| 567 |
+
def obj_to_dict(obj):
|
| 568 |
+
match obj:
|
| 569 |
+
case dict():
|
| 570 |
+
data = {}
|
| 571 |
+
for k, v in obj.items():
|
| 572 |
+
data[k] = obj_to_dict(v)
|
| 573 |
+
return data
|
| 574 |
+
|
| 575 |
+
case list() | tuple():
|
| 576 |
+
return [obj_to_dict(x) for x in obj]
|
| 577 |
+
|
| 578 |
+
case object(__dict__=_):
|
| 579 |
+
data = {
|
| 580 |
+
"_target_": f"{obj.__class__.__module__}.{obj.__class__.__name__}",
|
| 581 |
+
}
|
| 582 |
+
for k, v in obj.__dict__.items():
|
| 583 |
+
if not k.startswith("_"):
|
| 584 |
+
data[k] = obj_to_dict(v)
|
| 585 |
+
return data
|
| 586 |
+
|
| 587 |
+
case _:
|
| 588 |
+
return obj
|
| 589 |
+
|
| 590 |
+
return obj_to_dict(object_to_save)
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
def get_callable_from_string(callable_fn: str) -> Any:
|
| 594 |
+
"""
|
| 595 |
+
Get a callable from a string.
|
| 596 |
+
|
| 597 |
+
Args:
|
| 598 |
+
callable_fn (`str`):
|
| 599 |
+
The string representation of the callable.
|
| 600 |
+
|
| 601 |
+
Returns:
|
| 602 |
+
`Any`: The callable.
|
| 603 |
+
"""
|
| 604 |
+
# separate the function name from the module name
|
| 605 |
+
module_name, function_name = callable_fn.rsplit(".", 1)
|
| 606 |
+
# import the module
|
| 607 |
+
module = importlib.import_module(module_name)
|
| 608 |
+
# get the function
|
| 609 |
+
return getattr(module, function_name)
|
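Taken together, `model_name_or_path_resolver` and `from_cache` let callers pass a local path, a raw URL, or a model id interchangeably. A minimal usage sketch follows; the repo id below is a placeholder, and we assume it resolves to a HuggingFace Hub repository containing a `config.yaml`:

from relik.common.utils import from_cache

# Resolve a (hypothetical) Hub repo id to a local directory: the requested
# files are downloaded into the cache, and their parent folder is returned.
model_dir = from_cache(
    "some-org/some-relik-model",  # placeholder repo id
    filenames=["config.yaml"],
    force_download=False,
)
print(model_dir)  # a directory under the local cache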
relik/inference/__init__.py
ADDED
File without changes
relik/inference/annotator.py
ADDED
@@ -0,0 +1,422 @@
import os
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Union

import hydra
from omegaconf import OmegaConf
from rich.pretty import pprint

from relik.common.log import get_console_logger, get_logger
from relik.common.upload import upload
from relik.common.utils import CONFIG_NAME, from_cache, get_callable_from_string
from relik.inference.data.objects import EntitySpan, RelikOutput
from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from relik.inference.data.window.manager import WindowManager
from relik.reader.pytorch_modules.span import RelikReaderForSpanExtraction
from relik.reader.relik_reader import RelikReader
from relik.retriever.data.utils import batch_generator
from relik.retriever.indexers.base import BaseDocumentIndex
from relik.retriever.pytorch_modules.hf import GoldenRetrieverModel
from relik.retriever.pytorch_modules.model import GoldenRetriever

logger = get_logger(__name__)
console_logger = get_console_logger()


class Relik:
    """
    Relik main class. It is a wrapper around a retriever and a reader.

    Args:
        retriever (`Optional[GoldenRetriever]`, `optional`):
            The retriever to use. If `None`, a retriever will be instantiated from the
            provided `question_encoder`, `passage_encoder` and `document_index`.
            Defaults to `None`.
        question_encoder (`Optional[Union[str, GoldenRetrieverModel]]`, `optional`):
            The question encoder to use. If `retriever` is `None`, a retriever will be
            instantiated from this parameter. Defaults to `None`.
        passage_encoder (`Optional[Union[str, GoldenRetrieverModel]]`, `optional`):
            The passage encoder to use. If `retriever` is `None`, a retriever will be
            instantiated from this parameter. Defaults to `None`.
        document_index (`Optional[Union[str, BaseDocumentIndex]]`, `optional`):
            The document index to use. If `retriever` is `None`, a retriever will be
            instantiated from this parameter. Defaults to `None`.
        reader (`Optional[Union[str, RelikReader]]`, `optional`):
            The reader to use. If a string is provided, a `RelikReaderForSpanExtraction`
            will be instantiated from it. Defaults to `None`.
        retriever_device (`str`, `optional`):
            The device to use for the retriever. Defaults to the value of `device`.
    """

    def __init__(
        self,
        retriever: GoldenRetriever | None = None,
        question_encoder: str | GoldenRetrieverModel | None = None,
        passage_encoder: str | GoldenRetrieverModel | None = None,
        document_index: str | BaseDocumentIndex | None = None,
        reader: str | RelikReader | None = None,
        device: str = "cpu",
        retriever_device: str | None = None,
        document_index_device: str | None = None,
        reader_device: str | None = None,
        precision: int = 32,
        retriever_precision: int | None = None,
        document_index_precision: int | None = None,
        reader_precision: int | None = None,
        reader_kwargs: dict | None = None,
        retriever_kwargs: dict | None = None,
        candidates_preprocessing_fn: str | Callable | None = None,
        top_k: int | None = None,
        window_size: int | None = None,
        window_stride: int | None = None,
        **kwargs,
    ) -> None:
        # retriever
        retriever_device = retriever_device or device
        document_index_device = document_index_device or device
        retriever_precision = retriever_precision or precision
        document_index_precision = document_index_precision or precision
        if retriever is None and question_encoder is None:
            raise ValueError(
                "Either `retriever` or `question_encoder` must be provided"
            )
        if retriever is None:
            self.retriever_kwargs = dict(
                question_encoder=question_encoder,
                passage_encoder=passage_encoder,
                document_index=document_index,
                device=retriever_device,
                precision=retriever_precision,
                index_device=document_index_device,
                index_precision=document_index_precision,
            )
            # overwrite the default retriever kwargs with the provided retriever_kwargs
            self.retriever_kwargs.update(retriever_kwargs or {})
            retriever = GoldenRetriever(**self.retriever_kwargs)
            retriever.training = False
            retriever.eval()
        self.retriever = retriever

        # reader
        self.reader_device = reader_device or device
        self.reader_precision = reader_precision or precision
        self.reader_kwargs = reader_kwargs
        if isinstance(reader, str):
            reader_kwargs = reader_kwargs or {}
            reader = RelikReaderForSpanExtraction(reader, **reader_kwargs)
        self.reader = reader

        # windowization stuff
        self.tokenizer = SpacyTokenizer(language="en")
        self.window_manager: WindowManager | None = None

        # candidates preprocessing
        # TODO: maybe move this logic somewhere else
        candidates_preprocessing_fn = candidates_preprocessing_fn or (lambda x: x)
        if isinstance(candidates_preprocessing_fn, str):
            candidates_preprocessing_fn = get_callable_from_string(
                candidates_preprocessing_fn
            )
        self.candidates_preprocessing_fn = candidates_preprocessing_fn

        # inference params
        self.top_k = top_k
        self.window_size = window_size
        self.window_stride = window_stride

    def __call__(
        self,
        text: Union[str, list],
        top_k: Optional[int] = None,
        window_size: Optional[int] = None,
        window_stride: Optional[int] = None,
        retriever_batch_size: Optional[int] = 32,
        reader_batch_size: Optional[int] = 32,
        return_also_windows: bool = False,
        **kwargs,
    ) -> Union[RelikOutput, list[RelikOutput]]:
        """
        Annotate a text with entities.

        Args:
            text (`str` or `list`):
                The text to annotate. If a list is provided, each element of the list
                will be annotated separately.
            top_k (`int`, `optional`, defaults to `None`):
                The number of candidates to retrieve for each window.
            window_size (`int`, `optional`, defaults to `None`):
                The size of the window. If `None`, the whole text will be annotated.
            window_stride (`int`, `optional`, defaults to `None`):
                The stride of the window. If `None`, there will be no overlap between windows.
            retriever_batch_size (`int`, `optional`, defaults to `32`):
                The batch size to use for the retriever.
            reader_batch_size (`int`, `optional`, defaults to `32`):
                The batch size to use for the reader.
            return_also_windows (`bool`, `optional`, defaults to `False`):
                Whether to return the windows in the output.
            **kwargs:
                Additional keyword arguments to pass to the retriever and the reader.

        Returns:
            `RelikOutput` or `list[RelikOutput]`:
                The annotated text. If a list was provided as input, a list of
                `RelikOutput` objects will be returned.
        """
        if top_k is None:
            top_k = self.top_k or 100
        if window_size is None:
            window_size = self.window_size
        if window_stride is None:
            window_stride = self.window_stride

        if isinstance(text, str):
            text = [text]

        if window_size is not None:
            if self.window_manager is None:
                self.window_manager = WindowManager(self.tokenizer)

            if window_size == "sentence":
                # todo: implement sentence windowizer
                raise NotImplementedError("Sentence windowizer not implemented yet")

            # if window_size < window_stride:
            #     raise ValueError(
            #         f"Window size ({window_size}) must be greater than window stride ({window_stride})"
            #     )

        # window generator
        windows = [
            window
            for doc_id, t in enumerate(text)
            for window in self.window_manager.create_windows(
                t,
                window_size=window_size,
                stride=window_stride,
                doc_id=doc_id,
            )
        ]

        # retrieve candidates first
        windows_candidates = []
        # TODO: move batching inside the retriever
        for batch in batch_generator(windows, batch_size=retriever_batch_size):
            retriever_out = self.retriever.retrieve([b.text for b in batch], k=top_k)
            windows_candidates.extend(
                [[p.label for p in predictions] for predictions in retriever_out]
            )

        # add the retrieved passages to the windows
        for window, candidates in zip(windows, windows_candidates):
            window.window_candidates = [
                self.candidates_preprocessing_fn(c) for c in candidates
            ]

        windows = self.reader.read(samples=windows, max_batch_size=reader_batch_size)
        windows = self.window_manager.merge_windows(windows)

        # transform predictions into RelikOutput objects
        output = []
        for w in windows:
            sample_output = RelikOutput(
                text=text[w.doc_id],
                labels=sorted(
                    [
                        EntitySpan(
                            start=ss, end=se, label=sl, text=text[w.doc_id][ss:se]
                        )
                        for ss, se, sl in w.predicted_window_labels_chars
                    ],
                    key=lambda x: x.start,
                ),
            )
            output.append(sample_output)

        if return_also_windows:
            for i, sample_output in enumerate(output):
                sample_output.windows = [w for w in windows if w.doc_id == i]

        # if only one text was provided, return a single RelikOutput object
        if len(output) == 1:
            return output[0]

        return output

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_dir: Union[str, os.PathLike],
        config_kwargs: Optional[Dict] = None,
        config_file_name: str = CONFIG_NAME,
        *args,
        **kwargs,
    ) -> "Relik":
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)

        model_dir = from_cache(
            model_name_or_dir,
            filenames=[config_file_name],
            cache_dir=cache_dir,
            force_download=force_download,
        )

        config_path = model_dir / config_file_name
        if not config_path.exists():
            raise FileNotFoundError(
                f"Model configuration file not found at {config_path}."
            )

        # overwrite config with config_kwargs
        config = OmegaConf.load(config_path)
        if config_kwargs is not None:
            # TODO: check merging behavior
            config = OmegaConf.merge(config, OmegaConf.create(config_kwargs))
        # do we want to print the config? I like it
        pprint(OmegaConf.to_container(config), console=console_logger, expand_all=True)

        # load relik from config
        relik = hydra.utils.instantiate(config, *args, **kwargs)

        return relik

    def save_pretrained(
        self,
        output_dir: Union[str, os.PathLike],
        config: Optional[Dict[str, Any]] = None,
        config_file_name: Optional[str] = None,
        save_weights: bool = False,
        push_to_hub: bool = False,
        model_id: Optional[str] = None,
        organization: Optional[str] = None,
        repo_name: Optional[str] = None,
        **kwargs,
    ):
        """
        Save the configuration of Relik to the specified directory as a YAML file.

        Args:
            output_dir (`str`):
                The directory to save the configuration file to.
            config (`Optional[Dict[str, Any]]`, `optional`):
                The configuration to save. If `None`, the current configuration will be
                saved. Defaults to `None`.
            config_file_name (`Optional[str]`, `optional`):
                The name of the configuration file. Defaults to `config.yaml`.
            save_weights (`bool`, `optional`):
                Whether to save the weights of the model. Defaults to `False`.
            push_to_hub (`bool`, `optional`):
                Whether to push the saved model to the hub. Defaults to `False`.
            model_id (`Optional[str]`, `optional`):
                The id of the model to push to the hub. If `None`, the name of the
                directory will be used. Defaults to `None`.
            organization (`Optional[str]`, `optional`):
                The organization to push the model to. Defaults to `None`.
            repo_name (`Optional[str]`, `optional`):
                The name of the repository to push the model to. Defaults to `None`.
            **kwargs:
                Additional keyword arguments to pass to `OmegaConf.save`.
        """
        if config is None:
            # create a default config
            config = {
                "_target_": f"{self.__class__.__module__}.{self.__class__.__name__}"
            }
            if self.retriever is not None:
                if self.retriever.question_encoder is not None:
                    config["question_encoder"] = self.retriever.question_encoder.name_or_path
                if self.retriever.passage_encoder is not None:
                    config["passage_encoder"] = self.retriever.passage_encoder.name_or_path
                if self.retriever.document_index is not None:
                    config["document_index"] = self.retriever.document_index.name_or_dir
            if self.reader is not None:
                config["reader"] = self.reader.model_path

            config["retriever_kwargs"] = self.retriever_kwargs
            config["reader_kwargs"] = self.reader_kwargs
            # store the fn as a dotted path so it can be saved and loaded later
            config["candidates_preprocessing_fn"] = (
                f"{self.candidates_preprocessing_fn.__module__}.{self.candidates_preprocessing_fn.__name__}"
            )

            # these are model-specific and should be saved
            config["top_k"] = self.top_k
            config["window_size"] = self.window_size
            config["window_stride"] = self.window_stride

        config_file_name = config_file_name or CONFIG_NAME

        # create the output directory
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Saving relik config to {output_dir / config_file_name}")
        # pretty print the config
        pprint(config, console=console_logger, expand_all=True)
        OmegaConf.save(config, output_dir / config_file_name)

        if save_weights:
            model_id = model_id or output_dir.name
            retriever_model_id = model_id + "-retriever"
            # save weights
            logger.info(f"Saving retriever to {output_dir / retriever_model_id}")
            self.retriever.save_pretrained(
                output_dir / retriever_model_id,
                question_encoder_name=retriever_model_id + "-question-encoder",
                passage_encoder_name=retriever_model_id + "-passage-encoder",
                document_index_name=retriever_model_id + "-index",
                push_to_hub=push_to_hub,
                organization=organization,
                repo_name=repo_name,
                **kwargs,
            )
            reader_model_id = model_id + "-reader"
            logger.info(f"Saving reader to {output_dir / reader_model_id}")
            self.reader.save_pretrained(
                output_dir / reader_model_id,
                push_to_hub=push_to_hub,
                organization=organization,
                repo_name=repo_name,
                **kwargs,
            )

        if push_to_hub:
            # push to hub
            logger.info("Pushing to hub")
            model_id = model_id or output_dir.name
            upload(output_dir, model_id, organization=organization, repo_name=repo_name)


def main():
    from pprint import pprint

    relik = Relik(
        question_encoder="riccorl/relik-retriever-aida-blink-pretrain-omniencoder",
        document_index="riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder",
        reader="riccorl/relik-reader-aida-deberta-small",
        device="cuda",
        precision=16,
        top_k=100,
        window_size=32,
        window_stride=16,
        candidates_preprocessing_fn="relik.inference.preprocessing.wikipedia_title_and_openings_preprocessing",
    )

    input_text = """
    Bernie Ecclestone, the former boss of Formula One, has admitted fraud after failing to declare more than £400m held in a trust in Singapore.
    The 92-year-old billionaire did not disclose the trust to the government in July 2015.
    Appearing at Southwark Crown Court on Thursday, he told the judge "I plead guilty" after having previously pleaded not guilty.
    Ecclestone had been due to go on trial next month.
    """

    preds = relik(input_text)
    pprint(preds)


if __name__ == "__main__":
    main()
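As a usage note, the YAML written by `save_pretrained` is exactly what `from_pretrained` later feeds to `hydra.utils.instantiate`, so a config-only round trip looks like the sketch below (model ids taken from `main()` above; the output directory name is illustrative, and reloading assumes the component checkpoints named in the config are still reachable):

# Build a Relik from component names, save only its YAML config,
# then re-instantiate an equivalent object from that directory.
relik = Relik(
    question_encoder="riccorl/relik-retriever-aida-blink-pretrain-omniencoder",
    document_index="riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder",
    reader="riccorl/relik-reader-aida-deberta-small",
)
relik.save_pretrained("my-relik-model")  # writes my-relik-model/config.yaml
relik_reloaded = Relik.from_pretrained("my-relik-model")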
relik/inference/data/__init__.py
ADDED
File without changes
relik/inference/data/objects.py
ADDED
@@ -0,0 +1,64 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import List, NamedTuple, Optional

from relik.reader.pytorch_modules.hf.modeling_relik import RelikReaderSample


@dataclass
class Word:
    """
    A word representation that includes text, index in the sentence, POS tag, lemma,
    dependency relation, and similar information.

    # Parameters
    text : `str`, optional
        The text representation.
    index : `int`, optional
        The word offset in the sentence.
    lemma : `str`, optional
        The lemma of this word.
    pos : `str`, optional
        The coarse-grained part of speech of this word.
    dep : `str`, optional
        The dependency relation for this word.

    input_id : `int`, optional
        Integer representation of the word, used to pass it to a model.
    token_type_id : `int`, optional
        Token type id used by some transformers.
    attention_mask: `int`, optional
        Attention mask used by transformers, indicates to the model which tokens should
        be attended to, and which should not.
    """

    text: str
    index: int
    start_char: Optional[int] = None
    end_char: Optional[int] = None
    # preprocessing fields
    lemma: Optional[str] = None
    pos: Optional[str] = None
    dep: Optional[str] = None
    head: Optional[int] = None

    def __str__(self):
        return self.text

    def __repr__(self):
        return self.__str__()


class EntitySpan(NamedTuple):
    start: int
    end: int
    label: str
    text: str


@dataclass
class RelikOutput:
    text: str
    labels: List[EntitySpan]
    windows: Optional[List[RelikReaderSample]] = None
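To make the shape of these containers concrete, here is a small hand-built example (the spans are made up rather than model output):

text = "Mary sold the car to John."
output = RelikOutput(
    text=text,
    labels=[
        EntitySpan(start=0, end=4, label="PER", text=text[0:4]),      # "Mary"
        EntitySpan(start=21, end=25, label="PER", text=text[21:25]),  # "John"
    ],
)
# EntitySpan is a NamedTuple, so fields are accessible by name or position:
assert output.labels[0].label == "PER"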
relik/inference/data/tokenizers/__init__.py
ADDED
@@ -0,0 +1,89 @@
SPACY_LANGUAGE_MAPPER = {
    "ca": "ca_core_news_sm",
    "da": "da_core_news_sm",
    "de": "de_core_news_sm",
    "el": "el_core_news_sm",
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
    "it": "it_core_news_sm",
    "ja": "ja_core_news_sm",
    "lt": "lt_core_news_sm",
    "mk": "mk_core_news_sm",
    "nb": "nb_core_news_sm",
    "nl": "nl_core_news_sm",
    "pl": "pl_core_news_sm",
    "pt": "pt_core_news_sm",
    "ro": "ro_core_news_sm",
    "ru": "ru_core_news_sm",
    "xx": "xx_sent_ud_sm",
    "zh": "zh_core_web_sm",
    "ca_core_news_sm": "ca_core_news_sm",
    "ca_core_news_md": "ca_core_news_md",
    "ca_core_news_lg": "ca_core_news_lg",
    "ca_core_news_trf": "ca_core_news_trf",
    "da_core_news_sm": "da_core_news_sm",
    "da_core_news_md": "da_core_news_md",
    "da_core_news_lg": "da_core_news_lg",
    "da_core_news_trf": "da_core_news_trf",
    "de_core_news_sm": "de_core_news_sm",
    "de_core_news_md": "de_core_news_md",
    "de_core_news_lg": "de_core_news_lg",
    "de_dep_news_trf": "de_dep_news_trf",
    "el_core_news_sm": "el_core_news_sm",
    "el_core_news_md": "el_core_news_md",
    "el_core_news_lg": "el_core_news_lg",
    "en_core_web_sm": "en_core_web_sm",
    "en_core_web_md": "en_core_web_md",
    "en_core_web_lg": "en_core_web_lg",
    "en_core_web_trf": "en_core_web_trf",
    "es_core_news_sm": "es_core_news_sm",
    "es_core_news_md": "es_core_news_md",
    "es_core_news_lg": "es_core_news_lg",
    "es_dep_news_trf": "es_dep_news_trf",
    "fr_core_news_sm": "fr_core_news_sm",
    "fr_core_news_md": "fr_core_news_md",
    "fr_core_news_lg": "fr_core_news_lg",
    "fr_dep_news_trf": "fr_dep_news_trf",
    "it_core_news_sm": "it_core_news_sm",
    "it_core_news_md": "it_core_news_md",
    "it_core_news_lg": "it_core_news_lg",
    "ja_core_news_sm": "ja_core_news_sm",
    "ja_core_news_md": "ja_core_news_md",
    "ja_core_news_lg": "ja_core_news_lg",
    "ja_dep_news_trf": "ja_dep_news_trf",
    "lt_core_news_sm": "lt_core_news_sm",
    "lt_core_news_md": "lt_core_news_md",
    "lt_core_news_lg": "lt_core_news_lg",
    "mk_core_news_sm": "mk_core_news_sm",
    "mk_core_news_md": "mk_core_news_md",
    "mk_core_news_lg": "mk_core_news_lg",
    "nb_core_news_sm": "nb_core_news_sm",
    "nb_core_news_md": "nb_core_news_md",
    "nb_core_news_lg": "nb_core_news_lg",
    "nl_core_news_sm": "nl_core_news_sm",
    "nl_core_news_md": "nl_core_news_md",
    "nl_core_news_lg": "nl_core_news_lg",
    "pl_core_news_sm": "pl_core_news_sm",
    "pl_core_news_md": "pl_core_news_md",
    "pl_core_news_lg": "pl_core_news_lg",
    "pt_core_news_sm": "pt_core_news_sm",
    "pt_core_news_md": "pt_core_news_md",
    "pt_core_news_lg": "pt_core_news_lg",
    "ro_core_news_sm": "ro_core_news_sm",
    "ro_core_news_md": "ro_core_news_md",
    "ro_core_news_lg": "ro_core_news_lg",
    "ru_core_news_sm": "ru_core_news_sm",
    "ru_core_news_md": "ru_core_news_md",
    "ru_core_news_lg": "ru_core_news_lg",
    "xx_ent_wiki_sm": "xx_ent_wiki_sm",
    "xx_sent_ud_sm": "xx_sent_ud_sm",
    "zh_core_web_sm": "zh_core_web_sm",
    "zh_core_web_md": "zh_core_web_md",
    "zh_core_web_lg": "zh_core_web_lg",
    "zh_core_web_trf": "zh_core_web_trf",
}

from relik.inference.data.tokenizers.regex_tokenizer import RegexTokenizer
from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from relik.inference.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
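For reference, the mapper accepts either a two-letter shorthand or a full spaCy pipeline name, both resolving to a concrete model package:

SPACY_LANGUAGE_MAPPER["en"]               # -> "en_core_web_sm"
SPACY_LANGUAGE_MAPPER["en_core_web_trf"]  # -> "en_core_web_trf" (full names map to themselves)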
relik/inference/data/tokenizers/base_tokenizer.py
ADDED
@@ -0,0 +1,84 @@
from typing import List, Union

from relik.inference.data.objects import Word


class BaseTokenizer:
    """
    A :obj:`Tokenizer` splits strings of text into single words, optionally adds
    POS tags and performs lemmatization.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.
        """
        raise NotImplementedError

    def tokenize(self, text: str) -> List[Word]:
        """
        Implements splitting words into tokens.

        Args:
            text (:obj:`str`):
                Text to tokenize.

        Returns:
            :obj:`List[Word]`: The input text tokenized in single words.
        """
        raise NotImplementedError

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Implements batch splitting words into tokens.

        Args:
            texts (:obj:`List[str]`):
                Batch of text to tokenize.

        Returns:
            :obj:`List[List[Word]]`: The input batch tokenized in single words.
        """
        return [self.tokenize(text) for text in texts]

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Check if the input is batched or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        return bool(
            (not is_split_into_words and isinstance(texts, (list, tuple)))
            or (
                is_split_into_words
                and isinstance(texts, (list, tuple))
                and texts
                and isinstance(texts[0], (list, tuple))
            )
        )
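Concrete tokenizers only need to implement `tokenize` (the base class derives `tokenize_batch` from it). As an illustration only, not part of the package, a minimal subclass might look like:

class CommaTokenizer(BaseTokenizer):
    """Toy tokenizer that splits on commas (hypothetical, for illustration)."""

    def tokenize(self, text: str) -> List[Word]:
        words = []
        offset = 0
        for i, chunk in enumerate(text.split(",")):
            # record character offsets so tokens can be mapped back to the input
            words.append(Word(chunk, i, start_char=offset, end_char=offset + len(chunk)))
            offset += len(chunk) + 1  # skip past the comma itself
        return words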
relik/inference/data/tokenizers/regex_tokenizer.py
ADDED
@@ -0,0 +1,73 @@
import re
from typing import List, Union

from overrides import overrides

from relik.inference.data.objects import Word
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer


class RegexTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that splits the text based on a simple regex.
    """

    def __init__(self):
        super(RegexTokenizer, self).__init__()
        # regex for splitting on spaces, punctuation and new lines
        # self._regex = re.compile(r"\S+|[\[\](),.!?;:\"]|\\n")
        self._regex = re.compile(
            r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL
        )

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words by splitting using a simple regex.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Example::

            >>> from relik.inference.data.tokenizers.regex_tokenizer import RegexTokenizer

            >>> regex_tokenizer = RegexTokenizer()
            >>> regex_tokenizer("Mary sold the car to John.")

        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)

        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)

        return tokenized

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if not isinstance(text, (str, list)):
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )

        if isinstance(text, list):
            text = " ".join(text)
        return [
            Word(t[0], i, start_char=t[1], end_char=t[2])
            for i, t in enumerate(
                (m.group(0), m.start(), m.end()) for m in self._regex.finditer(text)
            )
        ]
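Since each `Word` carries `start_char`/`end_char` straight from `re.finditer`, tokens can always be sliced back out of the original string; a quick sanity check of that invariant:

regex_tokenizer = RegexTokenizer()
words = regex_tokenizer.tokenize("Mary sold the car to John.")
for w in words:
    # the character offsets always slice back to the token text
    assert "Mary sold the car to John."[w.start_char:w.end_char] == w.text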
relik/inference/data/tokenizers/spacy_tokenizer.py
ADDED
@@ -0,0 +1,228 @@
import logging
from typing import Dict, List, Tuple, Union

import spacy

# from ipa.common.utils import load_spacy
from overrides import overrides
from spacy.cli.download import download as spacy_download
from spacy.tokens import Doc

from relik.common.log import get_logger
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer

logger = get_logger(level=logging.DEBUG)

# spaCy and Stanza stuff

LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}


def load_spacy(
    language: str,
    pos_tags: bool = False,
    lemma: bool = False,
    parse: bool = False,
    split_on_spaces: bool = False,
) -> spacy.Language:
    """
    Download and load a spaCy model.

    Args:
        language (:obj:`str`, defaults to :obj:`en`):
            Language of the text to tokenize.
        pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with the spaCy model.
        lemma (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with the spaCy model.
        parse (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with the spaCy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.

    Returns:
        :obj:`spacy.Language`: The spaCy model loaded.
    """
    exclude = ["vectors", "textcat", "ner"]
    if not pos_tags:
        exclude.append("tagger")
    if not lemma:
        exclude.append("lemmatizer")
    if not parse:
        exclude.append("parser")

    # check if the model is already loaded;
    # if so, there is no need to reload it
    spacy_params = (language, pos_tags, lemma, parse, split_on_spaces)
    if spacy_params not in LOADED_SPACY_MODELS:
        try:
            spacy_tagger = spacy.load(language, exclude=exclude)
        except OSError:
            logger.warning(
                "Spacy model '%s' not found. Downloading and installing.", language
            )
            spacy_download(language)
            spacy_tagger = spacy.load(language, exclude=exclude)

        # if everything is disabled, return only the tokenizer
        # for faster tokenization
        # TODO: is it really faster?
        # if len(exclude) >= 6:
        #     spacy_tagger = spacy_tagger.tokenizer
        LOADED_SPACY_MODELS[spacy_params] = spacy_tagger

    return LOADED_SPACY_MODELS[spacy_params]


class SpacyTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that uses spaCy to tokenize and preprocess the text. It returns :obj:`Word` objects.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with the spaCy model.
        return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with the spaCy model.
        return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with the spaCy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.
        use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will load the spaCy model on GPU.
    """

    def __init__(
        self,
        language: str = "en",
        return_pos_tags: bool = False,
        return_lemmas: bool = False,
        return_deps: bool = False,
        split_on_spaces: bool = False,
        use_gpu: bool = False,
    ):
        super(SpacyTokenizer, self).__init__()
        if language not in SPACY_LANGUAGE_MAPPER:
            raise ValueError(
                f"`{language}` language not supported. The supported "
                f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
            )
        if use_gpu:
            # load the model on GPU;
            # if the GPU is not available or not correctly configured,
            # it will raise an error
            spacy.require_gpu()
        self.spacy = load_spacy(
            SPACY_LANGUAGE_MAPPER[language],
            return_pos_tags,
            return_lemmas,
            return_deps,
            split_on_spaces,
        )
        self.split_on_spaces = split_on_spaces

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[Word], List[List[Word]]]:
        """
        Tokenize the input into single words using spaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Example::

            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer

            >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
            >>> spacy_tokenizer("Mary sold the car to John.")

        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)
        return tokenized

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if self.split_on_spaces:
            if isinstance(text, str):
                text = text.split(" ")
            spaces = [True] * len(text)
            text = Doc(self.spacy.vocab, words=text, spaces=spaces)
        return self._clean_tokens(self.spacy(text))

    @overrides
    def tokenize_batch(
        self, texts: Union[List[str], List[List[str]]]
    ) -> List[List[Word]]:
        if self.split_on_spaces:
            if isinstance(texts[0], str):
                texts = [text.split(" ") for text in texts]
            spaces = [[True] * len(text) for text in texts]
            texts = [
                Doc(self.spacy.vocab, words=text, spaces=space)
                for text, space in zip(texts, spaces)
            ]
        return [self._clean_tokens(tokens) for tokens in self.spacy.pipe(texts)]

    @staticmethod
    def _clean_tokens(tokens: Doc) -> List[Word]:
        """
        Converts spaCy tokens to :obj:`Word`.

        Args:
            tokens (:obj:`spacy.tokens.Doc`):
                Tokens from a spaCy model.

        Returns:
            :obj:`List[Word]`: The spaCy model output converted into :obj:`Word` objects.
        """
        words = [
            Word(
                token.text,
                token.i,
                token.idx,
                token.idx + len(token),
                token.lemma_,
                token.pos_,
                token.dep_,
                token.head.i,
            )
            for token in tokens
        ]
        return words


class WhitespaceSpacyTokenizer:
    """Simple whitespace tokenizer for spaCy."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        if isinstance(text, str):
            words = text.split(" ")
        elif isinstance(text, list):
            words = text
        else:
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
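`WhitespaceSpacyTokenizer` follows spaCy's custom-tokenizer convention (a callable that takes the raw text and returns a `Doc`), so it can be dropped into an existing pipeline when the input is already space-separated. A sketch, assuming `en_core_web_sm` is installed:

import spacy

nlp = spacy.load("en_core_web_sm")
# replace spaCy's built-in tokenizer with the whitespace one defined above
nlp.tokenizer = WhitespaceSpacyTokenizer(nlp.vocab)
doc = nlp("Mary sold the car to John .")
print([t.text for t in doc])  # tokens are exactly the space-separated chunks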
relik/inference/data/tokenizers/whitespace_tokenizer.py
ADDED
@@ -0,0 +1,70 @@
import re
from typing import List, Union

from overrides import overrides

from relik.inference.data.objects import Word
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer


class WhitespaceTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that splits the text on spaces.
    """

    def __init__(self):
        super(WhitespaceTokenizer, self).__init__()
        self.whitespace_regex = re.compile(r"\S+")

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words by splitting on spaces.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Example::

            >>> from relik.inference.data.tokenizers import WhitespaceTokenizer

            >>> whitespace_tokenizer = WhitespaceTokenizer()
            >>> whitespace_tokenizer("Mary sold the car to John .")

        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)

        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)

        return tokenized

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if not isinstance(text, (str, list)):
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )

        if isinstance(text, list):
            text = " ".join(text)
        return [
            Word(t[0], i, start_char=t[1], end_char=t[2])
            for i, t in enumerate(
                (m.group(0), m.start(), m.end())
                for m in self.whitespace_regex.finditer(text)
            )
        ]
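Because `tokenize` matches `\S+` against the (re)joined string, character offsets stay faithful even across runs of whitespace. A quick sketch:

from relik.inference.data.tokenizers import WhitespaceTokenizer

whitespace_tokenizer = WhitespaceTokenizer()
# note the double space between "sold" and "the"
for word in whitespace_tokenizer.tokenize("Mary sold  the car"):
    print(word.text, word.start_char, word.end_char)
# Mary 0 4, sold 5 9, the 11 14, car 15 18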
relik/inference/data/window/__init__.py
ADDED
File without changes
relik/inference/data/window/manager.py
ADDED
@@ -0,0 +1,262 @@
import collections
import itertools
from dataclasses import dataclass
from typing import List, Optional, Set, Tuple

from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
from relik.reader.data.relik_reader_sample import RelikReaderSample


@dataclass
class Window:
    doc_id: int
    window_id: int
    text: str
    tokens: List[str]
    doc_topic: Optional[str]
    offset: int
    token2char_start: dict
    token2char_end: dict
    window_candidates: Optional[List[str]] = None


class WindowManager:
    def __init__(self, tokenizer: BaseTokenizer) -> None:
        self.tokenizer = tokenizer

    def tokenize(self, document: str) -> Tuple[List[str], List[Tuple[int, int]]]:
        tokenized_document = self.tokenizer(document)
        tokens = []
        tokens_char_mapping = []
        for token in tokenized_document:
            tokens.append(token.text)
            tokens_char_mapping.append((token.start_char, token.end_char))
        return tokens, tokens_char_mapping

    def create_windows(
        self,
        document: str,
        window_size: int,
        stride: int,
        doc_id: int = 0,
        doc_topic: Optional[str] = None,
    ) -> List[RelikReaderSample]:
        document_tokens, tokens_char_mapping = self.tokenize(document)
        if doc_topic is None:
            doc_topic = document_tokens[0] if len(document_tokens) > 0 else ""
        document_windows = []
        if len(document_tokens) <= window_size:
            text = document
            document_windows.append(
                RelikReaderSample(
                    doc_id=doc_id,
                    window_id=0,
                    text=text,
                    tokens=document_tokens,
                    doc_topic=doc_topic,
                    offset=0,
                    token2char_start={
                        str(i): tokens_char_mapping[i][0]
                        for i in range(len(document_tokens))
                    },
                    token2char_end={
                        str(i): tokens_char_mapping[i][1]
                        for i in range(len(document_tokens))
                    },
                )
            )
        else:
            for window_id, i in enumerate(range(0, len(document_tokens), stride)):
                # if the last stride is smaller than the window size, then we can
                # include more tokens from the previous window.
                if i != 0 and i + window_size > len(document_tokens):
                    overflowing_tokens = i + window_size - len(document_tokens)
                    if overflowing_tokens >= stride:
                        break
                    i -= overflowing_tokens

                involved_token_indices = list(
                    range(i, min(i + window_size, len(document_tokens)))
                )
                window_tokens = [document_tokens[j] for j in involved_token_indices]
                window_text_start = tokens_char_mapping[involved_token_indices[0]][0]
                window_text_end = tokens_char_mapping[involved_token_indices[-1]][1]
                text = document[window_text_start:window_text_end]
                document_windows.append(
                    RelikReaderSample(
                        doc_id=doc_id,
                        window_id=window_id,
                        text=text,
                        tokens=window_tokens,
                        doc_topic=doc_topic,
                        offset=window_text_start,
                        token2char_start={
                            str(i): tokens_char_mapping[ti][0]
                            for i, ti in enumerate(involved_token_indices)
                        },
                        token2char_end={
                            str(i): tokens_char_mapping[ti][1]
                            for i, ti in enumerate(involved_token_indices)
                        },
                    )
                )
        return document_windows

    def merge_windows(
        self, windows: List[RelikReaderSample]
    ) -> List[RelikReaderSample]:
        windows_by_doc_id = collections.defaultdict(list)
        for window in windows:
            windows_by_doc_id[window.doc_id].append(window)

        merged_window_by_doc = {
            doc_id: self.merge_doc_windows(doc_windows)
            for doc_id, doc_windows in windows_by_doc_id.items()
        }

        return list(merged_window_by_doc.values())

    def merge_doc_windows(self, windows: List[RelikReaderSample]) -> RelikReaderSample:
        if len(windows) == 1:
            return windows[0]

        if len(windows) > 0 and getattr(windows[0], "offset", None) is not None:
            windows = sorted(windows, key=(lambda x: x.offset))

        window_accumulator = windows[0]

        for next_window in windows[1:]:
            window_accumulator = self._merge_window_pair(
                window_accumulator, next_window
            )

        return window_accumulator

    def _merge_tokens(
        self, window1: RelikReaderSample, window2: RelikReaderSample
    ) -> Tuple[list, dict, dict]:
        w1_tokens = window1.tokens[1:-1]
        w2_tokens = window2.tokens[1:-1]

        # find intersection
        tokens_intersection = None
        for k in reversed(range(1, len(w1_tokens))):
            if w1_tokens[-k:] == w2_tokens[:k]:
                tokens_intersection = k
                break
        assert tokens_intersection is not None, (
            f"{window1.doc_id} - {window1.sent_id} - {window1.offset}"
            + f" {window2.doc_id} - {window2.sent_id} - {window2.offset}\n"
            + f"w1 tokens: {w1_tokens}\n"
            + f"w2 tokens: {w2_tokens}\n"
        )

        final_tokens = (
            [window1.tokens[0]]  # CLS
            + w1_tokens
            + w2_tokens[tokens_intersection:]
            + [window1.tokens[-1]]  # SEP
        )

        w2_starting_offset = len(w1_tokens) - tokens_intersection

        def merge_char_mapping(t2c1: dict, t2c2: dict) -> dict:
            final_t2c = dict()
            final_t2c.update(t2c1)
            for t, c in t2c2.items():
                t = int(t)
                if t < tokens_intersection:
                    continue
                final_t2c[str(t + w2_starting_offset)] = c
            return final_t2c

        return (
            final_tokens,
            merge_char_mapping(window1.token2char_start, window2.token2char_start),
            merge_char_mapping(window1.token2char_end, window2.token2char_end),
        )

    def _merge_span_annotation(
        self, span_annotation1: List[list], span_annotation2: List[list]
    ) -> List[list]:
        uniq_store = set()
        final_span_annotation_store = []
        for span_annotation in itertools.chain(span_annotation1, span_annotation2):
            span_annotation_id = tuple(span_annotation)
            if span_annotation_id not in uniq_store:
                uniq_store.add(span_annotation_id)
                final_span_annotation_store.append(span_annotation)
        return sorted(final_span_annotation_store, key=lambda x: x[0])

    def _merge_predictions(
        self,
        window1: RelikReaderSample,
        window2: RelikReaderSample,
    ) -> Tuple[Set[Tuple[int, int, str]], dict]:
        merged_predictions = window1.predicted_window_labels_chars.union(
            window2.predicted_window_labels_chars
        )

        span_title_probabilities = dict()
        # probabilities
        for span_prediction, predicted_probs in itertools.chain(
            window1.probs_window_labels_chars.items(),
            window2.probs_window_labels_chars.items(),
        ):
            if span_prediction not in span_title_probabilities:
                span_title_probabilities[span_prediction] = predicted_probs

        return merged_predictions, span_title_probabilities

    def _merge_window_pair(
        self,
        window1: RelikReaderSample,
        window2: RelikReaderSample,
    ) -> RelikReaderSample:
        merging_output = dict()

        if getattr(window1, "doc_id", None) is not None:
            assert window1.doc_id == window2.doc_id

        if getattr(window1, "offset", None) is not None:
            assert (
                window1.offset < window2.offset
            ), f"window 2 offset ({window2.offset}) is smaller than window 1 offset ({window1.offset})"

        merging_output["doc_id"] = window1.doc_id
        merging_output["offset"] = window2.offset

        m_tokens, m_token2char_start, m_token2char_end = self._merge_tokens(
            window1, window2
        )

        window_labels = None
        if getattr(window1, "window_labels", None) is not None:
            window_labels = self._merge_span_annotation(
                window1.window_labels, window2.window_labels
            )
        (
            predicted_window_labels_chars,
            probs_window_labels_chars,
        ) = self._merge_predictions(
            window1,
            window2,
        )

        merging_output.update(
            dict(
                tokens=m_tokens,
                token2char_start=m_token2char_start,
                token2char_end=m_token2char_end,
                window_labels=window_labels,
                predicted_window_labels_chars=predicted_window_labels_chars,
                probs_window_labels_chars=probs_window_labels_chars,
            )
        )

        return RelikReaderSample(**merging_output)
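To see the windowing above in action, here is a small sketch using the whitespace tokenizer from this commit; it assumes, as the merge code above already does, that RelikReaderSample exposes its keyword arguments as attributes:

from relik.inference.data.tokenizers import WhitespaceTokenizer
from relik.inference.data.window.manager import WindowManager

manager = WindowManager(tokenizer=WhitespaceTokenizer())

document = " ".join(f"tok{i}" for i in range(10))
# window_size=4 with stride=2 produces half-overlapping windows
for sample in manager.create_windows(document, window_size=4, stride=2):
    print(sample.window_id, sample.offset, sample.text)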
relik/inference/gerbil.py
ADDED
@@ -0,0 +1,254 @@
import argparse
import json
import logging
import os
import re
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Iterator, List, Optional, Tuple

from relik.inference.annotator import Relik
from relik.inference.data.objects import RelikOutput

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))

logger = logging.getLogger(__name__)


class GerbilAlbyManager:
    def __init__(
        self,
        annotator: Optional[Relik] = None,
        response_logger_dir: Optional[str] = None,
    ) -> None:
        self.annotator = annotator
        self.response_logger_dir = response_logger_dir
        self.predictions_counter = 0
        self.labels_mapping = None

    def annotate(self, document: str):
        relik_output: RelikOutput = self.annotator(document)
        annotations = [(ss, se, l) for ss, se, l, _ in relik_output.labels]
        if self.labels_mapping is not None:
            return [
                (ss, se, self.labels_mapping.get(l, l)) for ss, se, l in annotations
            ]
        return annotations

    def set_mapping_file(self, mapping_file_path: str):
        with open(mapping_file_path) as f:
            labels_mapping = json.load(f)
        self.labels_mapping = {v: k for k, v in labels_mapping.items()}

    def write_response_bundle(
        self,
        document: str,
        new_document: str,
        annotations: list,
        mapped_annotations: list,
    ) -> None:
        if self.response_logger_dir is None:
            return

        if not os.path.isdir(self.response_logger_dir):
            os.mkdir(self.response_logger_dir)

        with open(
            f"{self.response_logger_dir}/{self.predictions_counter}.json", "w"
        ) as f:
            out_json_obj = dict(
                document=document,
                new_document=new_document,
                annotations=annotations,
                mapped_annotations=mapped_annotations,
            )

            out_json_obj["span_annotations"] = [
                (ss, se, document[ss:se], label) for (ss, se, label) in annotations
            ]

            out_json_obj["span_mapped_annotations"] = [
                (ss, se, new_document[ss:se], label)
                for (ss, se, label) in mapped_annotations
            ]

            json.dump(out_json_obj, f, indent=2)

        self.predictions_counter += 1


manager = GerbilAlbyManager()


def preprocess_document(document: str) -> Tuple[str, List[Tuple[int, int]]]:
    pattern_subs = {
        "-LPR- ": " (",
        "-RPR-": ")",
        "\n\n": "\n",
        "-LRB-": "(",
        "-RRB-": ")",
        '","': ",",
    }

    document_acc = document
    curr_offset = 0
    char2offset = []

    matchings = re.finditer("({})".format("|".join(pattern_subs)), document)
    for span_matching in sorted(matchings, key=lambda x: x.span()[0]):
        span_start, span_end = span_matching.span()
        span_start -= curr_offset
        span_end -= curr_offset

        span_text = document_acc[span_start:span_end]
        span_sub = pattern_subs[span_text]
        document_acc = document_acc[:span_start] + span_sub + document_acc[span_end:]

        offset = len(span_text) - len(span_sub)
        curr_offset += offset

        char2offset.append((span_start + len(span_sub), curr_offset))

    return document_acc, char2offset


def map_back_annotations(
    annotations: List[Tuple[int, int, str]], char_mapping: List[Tuple[int, int]]
) -> Iterator[Tuple[int, int, str]]:
    def map_char(char_idx: int) -> int:
        current_offset = 0
        for offset_idx, offset_value in char_mapping:
            if char_idx >= offset_idx:
                current_offset = offset_value
            else:
                break
        return char_idx + current_offset

    for ss, se, label in annotations:
        yield map_char(ss), map_char(se), label


def annotate(document: str) -> List[Tuple[int, int, str]]:
    new_document, mapping = preprocess_document(document)
    logger.info("Mapping: " + str(mapping))
    logger.info("Document: " + str(document))
    annotations = [
        (cs, ce, label.replace(" ", "_"))
        for cs, ce, label in manager.annotate(new_document)
    ]
    logger.info("New document: " + str(new_document))
    mapped_annotations = (
        list(map_back_annotations(annotations, mapping))
        if len(mapping) > 0
        else annotations
    )

    logger.info(
        "Annotations: "
        + str([(ss, se, document[ss:se], ann) for ss, se, ann in mapped_annotations])
    )

    manager.write_response_bundle(
        document, new_document, mapped_annotations, annotations
    )

    if not all(
        [
            new_document[ss:se] == document[mss:mse]
            for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
        ]
    ):
        diff_mappings = [
            (new_document[ss:se], document[mss:mse])
            for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
        ]
        logger.error("Annotation mismatch after mapping: " + str(diff_mappings))
        return None
    assert all(
        [
            document[mss:mse] == new_document[ss:se]
            for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
        ]
    ), (mapped_annotations, annotations)

    return [(cs, ce - cs, label) for cs, ce, label in mapped_annotations]


class GetHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        content_length = int(self.headers["Content-Length"])
        post_data = self.rfile.read(content_length)
        self.send_response(200)
        self.end_headers()
        doc_text = read_json(post_data)
        response = annotate(doc_text)

        self.wfile.write(bytes(json.dumps(response), "utf-8"))
        return


def read_json(post_data):
    data = json.loads(post_data.decode("utf-8"))
    text = data["text"]
    return text


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--relik-model-name", required=True)
    parser.add_argument("--responses-log-dir")
    parser.add_argument("--log-file", default="logs/logging.txt")
    parser.add_argument("--mapping-file")
    return parser.parse_args()


def main():
    args = parse_args()

    # init manager
    manager.response_logger_dir = args.responses_log_dir
    # manager.annotator = Relik.from_pretrained(args.relik_model_name)

    print("Debugging: not using your ReLiK model but a hardcoded one.")
    manager.annotator = Relik(
        question_encoder="riccorl/relik-retriever-aida-blink-pretrain-omniencoder",
        document_index="riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder",
        reader="relik/reader/models/relik-reader-deberta-base-new-data",
        window_size=32,
        window_stride=16,
        candidates_preprocessing_fn=(lambda x: x.split("<def>")[0].strip()),
    )

    if args.mapping_file is not None:
        manager.set_mapping_file(args.mapping_file)

    port = 6654
    server = HTTPServer(("localhost", port), GetHandler)
    logger.info(f"Starting server at http://localhost:{port}")

    # Create a file handler and set its level
    file_handler = logging.FileHandler(args.log_file)
    file_handler.setLevel(logging.DEBUG)

    # Create a log formatter and set it on the handler
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    file_handler.setFormatter(formatter)

    # Add the file handler to the logger
    logger.addHandler(file_handler)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        exit(0)


if __name__ == "__main__":
    main()
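`preprocess_document` and `map_back_annotations` are pure functions, so the offset bookkeeping can be checked in isolation. A small round-trip sketch on a document with PTB-style bracket tokens:

from relik.inference.gerbil import map_back_annotations, preprocess_document

document = "Obama -LRB- born 1961 -RRB- visited Rome"
new_document, mapping = preprocess_document(document)
print(new_document)  # Obama ( born 1961 ) visited Rome

# pretend the annotator tagged "Rome" in the rewritten text
start = new_document.index("Rome")
annotations = [(start, start + len("Rome"), "Rome")]
for ss, se, label in map_back_annotations(annotations, mapping):
    # the mapped offsets point back into the original document
    print(document[ss:se], label)  # Rome Rome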
relik/inference/preprocessing.py
ADDED
@@ -0,0 +1,4 @@
def wikipedia_title_and_openings_preprocessing(
    wikipedia_title_and_openings: str, separator: str = " <def>"
):
    return wikipedia_title_and_openings.split(separator, 1)[0]
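This helper keeps only the Wikipedia title from a retriever passage of the form "Title <def> opening sentences". For instance:

from relik.inference.preprocessing import wikipedia_title_and_openings_preprocessing

passage = "Rome <def> Rome is the capital city of Italy."
print(wikipedia_title_and_openings_preprocessing(passage))  # -> "Rome"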
relik/inference/serve/__init__.py
ADDED
File without changes
relik/inference/serve/backend/__init__.py
ADDED
File without changes
relik/inference/serve/backend/relik.py
ADDED
@@ -0,0 +1,210 @@
import logging
from pathlib import Path
from typing import List, Optional, Union

from relik.common.utils import is_package_available
from relik.inference.annotator import Relik

if not is_package_available("fastapi"):
    raise ImportError(
        "FastAPI is not installed. Please install FastAPI with `pip install relik[serve]`."
    )
from fastapi import FastAPI, HTTPException

if not is_package_available("ray"):
    raise ImportError(
        "Ray is not installed. Please install Ray with `pip install relik[serve]`."
    )
from ray import serve

from relik.common.log import get_logger
from relik.inference.data.tokenizers import SpacyTokenizer, WhitespaceTokenizer
from relik.inference.data.window.manager import WindowManager
from relik.inference.serve.backend.utils import (
    RayParameterManager,
    ServerParameterManager,
)
from relik.retriever.data.utils import batch_generator

logger = get_logger(__name__, level=logging.INFO)

VERSION = {}  # type: ignore
with open(
    Path(__file__).parent.parent.parent.parent / "version.py", "r"
) as version_file:
    exec(version_file.read(), VERSION)

# Env variables for server
SERVER_MANAGER = ServerParameterManager()
RAY_MANAGER = RayParameterManager()

app = FastAPI(
    title="ReLiK",
    version=VERSION["VERSION"],
    description="ReLiK REST API",
)


@serve.deployment(
    ray_actor_options={
        "num_gpus": RAY_MANAGER.num_gpus
        if (
            SERVER_MANAGER.retriver_device == "cuda"
            or SERVER_MANAGER.reader_device == "cuda"
        )
        else 0
    },
    autoscaling_config={
        "min_replicas": RAY_MANAGER.min_replicas,
        "max_replicas": RAY_MANAGER.max_replicas,
    },
)
@serve.ingress(app)
class RelikServer:
    def __init__(
        self,
        question_encoder: str,
        document_index: str,
        passage_encoder: Optional[str] = None,
        reader_encoder: Optional[str] = None,
        top_k: int = 100,
        retriver_device: str = "cpu",
        reader_device: str = "cpu",
        index_device: Optional[str] = None,
        precision: int = 32,
        index_precision: Optional[int] = None,
        use_faiss: bool = False,
        window_batch_size: int = 32,
        window_size: int = 32,
        window_stride: int = 16,
        split_on_spaces: bool = False,
    ):
        # parameters
        self.question_encoder = question_encoder
        self.passage_encoder = passage_encoder
        self.reader_encoder = reader_encoder
        self.document_index = document_index
        self.top_k = top_k
        self.retriver_device = retriver_device
        self.index_device = index_device or retriver_device
        self.reader_device = reader_device
        self.precision = precision
        self.index_precision = index_precision or precision
        self.use_faiss = use_faiss
        self.window_batch_size = window_batch_size
        self.window_size = window_size
        self.window_stride = window_stride
        self.split_on_spaces = split_on_spaces

        # log stuff for debugging
        logger.info("Initializing RelikServer with parameters:")
        logger.info(f"QUESTION_ENCODER: {self.question_encoder}")
        logger.info(f"PASSAGE_ENCODER: {self.passage_encoder}")
        logger.info(f"READER_ENCODER: {self.reader_encoder}")
        logger.info(f"DOCUMENT_INDEX: {self.document_index}")
        logger.info(f"TOP_K: {self.top_k}")
        logger.info(f"RETRIEVER_DEVICE: {self.retriver_device}")
        logger.info(f"READER_DEVICE: {self.reader_device}")
        logger.info(f"INDEX_DEVICE: {self.index_device}")
        logger.info(f"PRECISION: {self.precision}")
        logger.info(f"INDEX_PRECISION: {self.index_precision}")
        logger.info(f"WINDOW_BATCH_SIZE: {self.window_batch_size}")
        logger.info(f"SPLIT_ON_SPACES: {self.split_on_spaces}")

        self.relik = Relik(
            question_encoder=self.question_encoder,
            passage_encoder=self.passage_encoder,
            document_index=self.document_index,
            reader=self.reader_encoder,
            retriever_device=self.retriver_device,
            document_index_device=self.index_device,
            reader_device=self.reader_device,
            retriever_precision=self.precision,
            document_index_precision=self.index_precision,
            reader_precision=self.precision,
        )

        # tokenizer and window manager for the GERBIL endpoint
        if self.split_on_spaces:
            logger.info("Using WhitespaceTokenizer")
            self.tokenizer = WhitespaceTokenizer()
        else:
            logger.info("Using SpacyTokenizer")
            self.tokenizer = SpacyTokenizer(language="en")
        self.window_manager = WindowManager(tokenizer=self.tokenizer)

    # @serve.batch()
    async def handle_batch(self, documents: List[str]) -> List:
        return self.relik(
            documents,
            top_k=self.top_k,
            window_size=self.window_size,
            window_stride=self.window_stride,
            batch_size=self.window_batch_size,
        )

    async def handle_batch_retriever(
        self, text: List[str], text_pair: List[str]
    ) -> List:
        # NOTE: assumes the Relik annotator exposes its underlying
        # GoldenRetriever as `retriever`.
        return self.relik.retriever.retrieve(
            list(text), text_pair=list(text_pair), k=self.top_k
        )

    @app.post("/api/entities")
    async def entities_endpoint(
        self,
        documents: Union[str, List[str]],
    ):
        try:
            # normalize input
            if isinstance(documents, str):
                documents = [documents]
            # get predictions from the full retriever + reader pipeline
            return await self.handle_batch(documents)
        except Exception as e:
            # log the entire stack trace
            logger.exception(e)
            raise HTTPException(status_code=500, detail=f"Server Error: {e}")

    @app.post("/api/gerbil")
    async def gerbil_endpoint(self, documents: Union[str, List[str]]):
        try:
            # normalize input
            if isinstance(documents, str):
                documents = [documents]

            # output list
            windows_passages = []
            # split documents into windows
            document_windows = [
                window
                for doc_id, document in enumerate(documents)
                for window in self.window_manager.create_windows(
                    document,
                    window_size=self.window_size,
                    stride=self.window_stride,
                    doc_id=doc_id,
                )
            ]

            # get text and topic from document windows and create new list
            model_inputs = [
                (window.text, window.doc_topic) for window in document_windows
            ]

            # batch generator
            for batch in batch_generator(
                model_inputs, batch_size=self.window_batch_size
            ):
                text, text_pair = zip(*batch)
                batch_predictions = await self.handle_batch_retriever(text, text_pair)
                windows_passages.extend(
                    [
                        [p.label for p in predictions]
                        for predictions in batch_predictions
                    ]
                )

            # add passage to document windows
            for window, passages in zip(document_windows, windows_passages):
                # clean up passages (remove everything after first <def> tag if present)
                passages = [c.split(" <def>", 1)[0] for c in passages]
                window.window_candidates = passages

            # return document windows
            return document_windows

        except Exception as e:
            # log the entire stack trace
            logger.exception(e)
            raise HTTPException(status_code=500, detail=f"Server Error: {e}")


server = RelikServer.bind(**vars(SERVER_MANAGER))
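With the deployment bound above, the entities endpoint accepts either a single string or a list of strings. A minimal client sketch, assuming the Ray Serve app is running locally on its default HTTP port 8000 (the same default the Streamlit frontend below uses):

import requests

# assumes the RelikServer deployment is up at localhost:8000
response = requests.post(
    "http://localhost:8000/api/entities",
    json="Obama went to Rome for a quick vacation.",
)
response.raise_for_status()
print(response.json())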
relik/inference/serve/backend/retriever.py
ADDED
@@ -0,0 +1,206 @@
import logging
from pathlib import Path
from typing import List, Optional, Union

from relik.common.utils import is_package_available

if not is_package_available("fastapi"):
    raise ImportError(
        "FastAPI is not installed. Please install FastAPI with `pip install relik[serve]`."
    )
from fastapi import FastAPI, HTTPException

if not is_package_available("ray"):
    raise ImportError(
        "Ray is not installed. Please install Ray with `pip install relik[serve]`."
    )
from ray import serve

from relik.common.log import get_logger
from relik.inference.data.tokenizers import SpacyTokenizer, WhitespaceTokenizer
from relik.inference.data.window.manager import WindowManager
from relik.inference.serve.backend.utils import (
    RayParameterManager,
    ServerParameterManager,
)
from relik.retriever.data.utils import batch_generator
from relik.retriever.pytorch_modules import GoldenRetriever

logger = get_logger(__name__, level=logging.INFO)

VERSION = {}  # type: ignore
with open(
    Path(__file__).parent.parent.parent.parent / "version.py", "r"
) as version_file:
    exec(version_file.read(), VERSION)

# Env variables for server
SERVER_MANAGER = ServerParameterManager()
RAY_MANAGER = RayParameterManager()

app = FastAPI(
    title="Golden Retriever",
    version=VERSION["VERSION"],
    description="Golden Retriever REST API",
)


@serve.deployment(
    ray_actor_options={
        "num_gpus": RAY_MANAGER.num_gpus if SERVER_MANAGER.retriver_device == "cuda" else 0
    },
    autoscaling_config={
        "min_replicas": RAY_MANAGER.min_replicas,
        "max_replicas": RAY_MANAGER.max_replicas,
    },
)
@serve.ingress(app)
class GoldenRetrieverServer:
    def __init__(
        self,
        question_encoder: str,
        document_index: str,
        passage_encoder: Optional[str] = None,
        top_k: int = 100,
        device: str = "cpu",
        index_device: Optional[str] = None,
        precision: int = 32,
        index_precision: Optional[int] = None,
        use_faiss: bool = False,
        window_batch_size: int = 32,
        window_size: int = 32,
        window_stride: int = 16,
        split_on_spaces: bool = False,
        **kwargs,  # absorb reader-only settings from the shared ServerParameterManager
    ):
        # parameters
        self.question_encoder = question_encoder
        self.passage_encoder = passage_encoder
        self.document_index = document_index
        self.top_k = top_k
        self.device = device
        self.index_device = index_device or device
        self.precision = precision
        self.index_precision = index_precision or precision
        self.use_faiss = use_faiss
        self.window_batch_size = window_batch_size
        self.window_size = window_size
        self.window_stride = window_stride
        self.split_on_spaces = split_on_spaces

        # log stuff for debugging
        logger.info("Initializing GoldenRetrieverServer with parameters:")
        logger.info(f"QUESTION_ENCODER: {self.question_encoder}")
        logger.info(f"PASSAGE_ENCODER: {self.passage_encoder}")
        logger.info(f"DOCUMENT_INDEX: {self.document_index}")
        logger.info(f"TOP_K: {self.top_k}")
        logger.info(f"DEVICE: {self.device}")
        logger.info(f"INDEX_DEVICE: {self.index_device}")
        logger.info(f"PRECISION: {self.precision}")
        logger.info(f"INDEX_PRECISION: {self.index_precision}")
        logger.info(f"WINDOW_BATCH_SIZE: {self.window_batch_size}")
        logger.info(f"SPLIT_ON_SPACES: {self.split_on_spaces}")

        self.retriever = GoldenRetriever(
            question_encoder=self.question_encoder,
            passage_encoder=self.passage_encoder,
            document_index=self.document_index,
            device=self.device,
            index_device=self.index_device,
            index_precision=self.index_precision,
        )
        self.retriever.eval()

        if self.split_on_spaces:
            logger.info("Using WhitespaceTokenizer")
            self.tokenizer = WhitespaceTokenizer()
            # logger.info("Using RegexTokenizer")
            # self.tokenizer = RegexTokenizer()
        else:
            logger.info("Using SpacyTokenizer")
            self.tokenizer = SpacyTokenizer(language="en")

        self.window_manager = WindowManager(tokenizer=self.tokenizer)

    # @serve.batch()
    async def handle_batch(
        self, documents: List[str], document_topics: List[str]
    ) -> List:
        return self.retriever.retrieve(
            documents, text_pair=document_topics, k=self.top_k, precision=self.precision
        )

    @app.post("/api/retrieve")
    async def retrieve_endpoint(
        self,
        documents: Union[str, List[str]],
        document_topics: Optional[Union[str, List[str]]] = None,
    ):
        try:
            # normalize input
            if isinstance(documents, str):
                documents = [documents]
            if document_topics is not None:
                if isinstance(document_topics, str):
                    document_topics = [document_topics]
                assert len(documents) == len(document_topics)
            # get predictions
            return await self.handle_batch(documents, document_topics)
        except Exception as e:
            # log the entire stack trace
            logger.exception(e)
            raise HTTPException(status_code=500, detail=f"Server Error: {e}")

    @app.post("/api/gerbil")
    async def gerbil_endpoint(self, documents: Union[str, List[str]]):
        try:
            # normalize input
            if isinstance(documents, str):
                documents = [documents]

            # output list
            windows_passages = []
            # split documents into windows
            document_windows = [
                window
                for doc_id, document in enumerate(documents)
                for window in self.window_manager.create_windows(
                    document,
                    window_size=self.window_size,
                    stride=self.window_stride,
                    doc_id=doc_id,
                )
            ]

            # get text and topic from document windows and create new list
            model_inputs = [
                (window.text, window.doc_topic) for window in document_windows
            ]

            # batch generator
            for batch in batch_generator(
                model_inputs, batch_size=self.window_batch_size
            ):
                text, text_pair = zip(*batch)
                batch_predictions = await self.handle_batch(text, text_pair)
                windows_passages.extend(
                    [
                        [p.label for p in predictions]
                        for predictions in batch_predictions
                    ]
                )

            # add passage to document windows
            for window, passages in zip(document_windows, windows_passages):
                # clean up passages (remove everything after first <def> tag if present)
                passages = [c.split(" <def>", 1)[0] for c in passages]
                window.window_candidates = passages

            # return document windows
            return document_windows

        except Exception as e:
            # log the entire stack trace
            logger.exception(e)
            raise HTTPException(status_code=500, detail=f"Server Error: {e}")


server = GoldenRetrieverServer.bind(**vars(SERVER_MANAGER))
relik/inference/serve/backend/utils.py
ADDED
@@ -0,0 +1,29 @@
import os
from dataclasses import dataclass
from typing import Union


@dataclass
class ServerParameterManager:
    retriver_device: str = os.environ.get("RETRIEVER_DEVICE", "cpu")
    reader_device: str = os.environ.get("READER_DEVICE", "cpu")
    index_device: str = os.environ.get("INDEX_DEVICE", retriver_device)
    precision: Union[str, int] = os.environ.get("PRECISION", "fp32")
    index_precision: Union[str, int] = os.environ.get("INDEX_PRECISION", precision)
    question_encoder: str = os.environ.get("QUESTION_ENCODER", None)
    passage_encoder: str = os.environ.get("PASSAGE_ENCODER", None)
    document_index: str = os.environ.get("DOCUMENT_INDEX", None)
    reader_encoder: str = os.environ.get("READER_ENCODER", None)
    top_k: int = int(os.environ.get("TOP_K", 100))
    use_faiss: bool = os.environ.get("USE_FAISS", False)
    window_batch_size: int = int(os.environ.get("WINDOW_BATCH_SIZE", 32))
    window_size: int = int(os.environ.get("WINDOW_SIZE", 32))
    window_stride: int = int(os.environ.get("WINDOW_STRIDE", 16))
    split_on_spaces: bool = os.environ.get("SPLIT_ON_SPACES", False)


class RayParameterManager:
    def __init__(self) -> None:
        self.num_gpus = int(os.environ.get("NUM_GPUS", 1))
        self.min_replicas = int(os.environ.get("MIN_REPLICAS", 1))
        self.max_replicas = int(os.environ.get("MAX_REPLICAS", 1))
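Both servers above are configured purely through environment variables via these managers. A small sketch of overriding the defaults; the variables must be set before the module is imported, since the dataclass defaults are evaluated at import time, and the checkpoint names are the ones hardcoded in gerbil.py above:

import os

# set before importing the server utils: defaults are read at import time
os.environ["QUESTION_ENCODER"] = "riccorl/relik-retriever-aida-blink-pretrain-omniencoder"
os.environ["DOCUMENT_INDEX"] = "riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder"
os.environ["TOP_K"] = "50"
os.environ["WINDOW_STRIDE"] = "16"

from relik.inference.serve.backend.utils import ServerParameterManager

params = ServerParameterManager()
print(params.question_encoder, params.top_k, params.window_stride)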
relik/inference/serve/frontend/__init__.py
ADDED
File without changes
relik/inference/serve/frontend/relik.py
ADDED
@@ -0,0 +1,231 @@
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
import streamlit as st
|
| 8 |
+
from spacy import displacy
|
| 9 |
+
from streamlit_extras.badges import badge
|
| 10 |
+
from streamlit_extras.stylable_container import stylable_container
|
| 11 |
+
|
| 12 |
+
RELIK = os.getenv("RELIK", "localhost:8000/api/entities")
|
| 13 |
+
|
| 14 |
+
import random
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_random_color(ents):
|
| 18 |
+
colors = {}
|
| 19 |
+
random_colors = generate_pastel_colors(len(ents))
|
| 20 |
+
for ent in ents:
|
| 21 |
+
colors[ent] = random_colors.pop(random.randint(0, len(random_colors) - 1))
|
| 22 |
+
return colors
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def floatrange(start, stop, steps):
|
| 26 |
+
if int(steps) == 1:
|
| 27 |
+
return [stop]
|
| 28 |
+
return [
|
| 29 |
+
start + float(i) * (stop - start) / (float(steps) - 1) for i in range(steps)
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def hsl_to_rgb(h, s, l):
|
| 34 |
+
def hue_2_rgb(v1, v2, v_h):
|
| 35 |
+
while v_h < 0.0:
|
| 36 |
+
v_h += 1.0
|
| 37 |
+
while v_h > 1.0:
|
| 38 |
+
v_h -= 1.0
|
| 39 |
+
if 6 * v_h < 1.0:
|
| 40 |
+
return v1 + (v2 - v1) * 6.0 * v_h
|
| 41 |
+
if 2 * v_h < 1.0:
|
| 42 |
+
return v2
|
| 43 |
+
if 3 * v_h < 2.0:
|
| 44 |
+
return v1 + (v2 - v1) * ((2.0 / 3.0) - v_h) * 6.0
|
| 45 |
+
return v1
|
| 46 |
+
|
| 47 |
+
# if not (0 <= s <= 1): raise ValueError, "s (saturation) parameter must be between 0 and 1."
|
| 48 |
+
# if not (0 <= l <= 1): raise ValueError, "l (lightness) parameter must be between 0 and 1."
|
| 49 |
+
|
| 50 |
+
r, b, g = (l * 255,) * 3
|
| 51 |
+
if s != 0.0:
|
| 52 |
+
if l < 0.5:
|
| 53 |
+
var_2 = l * (1.0 + s)
|
| 54 |
+
else:
|
| 55 |
+
var_2 = (l + s) - (s * l)
|
| 56 |
+
var_1 = 2.0 * l - var_2
|
| 57 |
+
r = 255 * hue_2_rgb(var_1, var_2, h + (1.0 / 3.0))
|
| 58 |
+
g = 255 * hue_2_rgb(var_1, var_2, h)
|
| 59 |
+
b = 255 * hue_2_rgb(var_1, var_2, h - (1.0 / 3.0))
|
| 60 |
+
|
| 61 |
+
return int(round(r)), int(round(g)), int(round(b))
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def generate_pastel_colors(n):
|
| 65 |
+
"""Return different pastel colours.
|
| 66 |
+
|
| 67 |
+
Input:
|
| 68 |
+
n (integer) : The number of colors to return
|
| 69 |
+
|
| 70 |
+
Output:
|
| 71 |
+
A list of colors in HTML notation (eg.['#cce0ff', '#ffcccc', '#ccffe0', '#f5ccff', '#f5ffcc'])
|
| 72 |
+
|
| 73 |
+
Example:
|
| 74 |
+
>>> print generate_pastel_colors(5)
|
| 75 |
+
['#cce0ff', '#f5ccff', '#ffcccc', '#f5ffcc', '#ccffe0']
|
| 76 |
+
"""
|
| 77 |
+
if n == 0:
|
| 78 |
+
return []
|
| 79 |
+
|
| 80 |
+
# To generate colors, we use the HSL colorspace (see http://en.wikipedia.org/wiki/HSL_color_space)
|
| 81 |
+
start_hue = 0.6 # 0=red 1/3=0.333=green 2/3=0.666=blue
|
| 82 |
+
saturation = 1.0
|
| 83 |
+
lightness = 0.8
|
| 84 |
+
# We take points around the chromatic circle (hue):
|
| 85 |
+
# (Note: we generate n+1 colors, then drop the last one ([:-1]) because
|
| 86 |
+
# it equals the first one (hue 0 = hue 1))
|
| 87 |
+
return [
|
| 88 |
+
"#%02x%02x%02x" % hsl_to_rgb(hue, saturation, lightness)
|
| 89 |
+
for hue in floatrange(start_hue, start_hue + 1, n + 1)
|
| 90 |
+
][:-1]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def set_sidebar(css):
|
| 94 |
+
white_link_wrapper = "<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css'><a href='{}'>{}</a>"
|
| 95 |
+
with st.sidebar:
|
| 96 |
+
st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
|
| 97 |
+
st.image(
|
| 98 |
+
"http://nlp.uniroma1.it/static/website/sapienza-nlp-logo-wh.svg",
|
| 99 |
+
use_column_width=True,
|
| 100 |
+
)
|
| 101 |
+
st.markdown("## ReLiK")
|
| 102 |
+
st.write(
|
| 103 |
+
f"""
|
| 104 |
+
- {white_link_wrapper.format("#", "<i class='fa-solid fa-file'></i> Paper")}
|
| 105 |
+
- {white_link_wrapper.format("https://github.com/SapienzaNLP/relik", "<i class='fa-brands fa-github'></i> GitHub")}
|
| 106 |
+
- {white_link_wrapper.format("https://hub.docker.com/repository/docker/sapienzanlp/relik", "<i class='fa-brands fa-docker'></i> Docker Hub")}
|
| 107 |
+
""",
|
| 108 |
+
unsafe_allow_html=True,
|
| 109 |
+
)
|
| 110 |
+
st.markdown("## Sapienza NLP")
|
| 111 |
+
st.write(
|
| 112 |
+
f"""
|
| 113 |
+
- {white_link_wrapper.format("https://nlp.uniroma1.it", "<i class='fa-solid fa-globe'></i> Webpage")}
|
| 114 |
+
- {white_link_wrapper.format("https://github.com/SapienzaNLP", "<i class='fa-brands fa-github'></i> GitHub")}
|
| 115 |
+
- {white_link_wrapper.format("https://twitter.com/SapienzaNLP", "<i class='fa-brands fa-twitter'></i> Twitter")}
|
| 116 |
+
- {white_link_wrapper.format("https://www.linkedin.com/company/79434450", "<i class='fa-brands fa-linkedin'></i> LinkedIn")}
|
| 117 |
+
""",
|
| 118 |
+
unsafe_allow_html=True,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def get_el_annotations(response):
|
| 123 |
+
# swap labels key with ents
|
| 124 |
+
response["ents"] = response.pop("labels")
|
| 125 |
+
label_in_text = set(l["label"] for l in response["ents"])
|
| 126 |
+
options = {"ents": label_in_text, "colors": get_random_color(label_in_text)}
|
| 127 |
+
return response, options
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def set_intro(css):
|
| 131 |
+
# intro
|
| 132 |
+
st.markdown("# ReLik")
|
| 133 |
+
st.markdown(
|
| 134 |
+
"### Retrieve, Read and LinK: Fast and Accurate Entity Linking and Relation Extraction on an Academic Budget"
|
| 135 |
+
)
|
| 136 |
+
# st.markdown(
|
| 137 |
+
# "This is a front-end for the paper [Universal Semantic Annotator: the First Unified API "
|
| 138 |
+
# "for WSD, SRL and Semantic Parsing](https://www.researchgate.net/publication/360671045_Universal_Semantic_Annotator_the_First_Unified_API_for_WSD_SRL_and_Semantic_Parsing), which will be presented at LREC 2022 by "
|
| 139 |
+
# "[Riccardo Orlando](https://riccorl.github.io), [Simone Conia](https://c-simone.github.io/), "
|
| 140 |
+
# "[Stefano Faralli](https://corsidilaurea.uniroma1.it/it/users/stefanofaralliuniroma1it), and [Roberto Navigli](https://www.diag.uniroma1.it/navigli/)."
|
| 141 |
+
# )
|
| 142 |
+
badge(type="github", name="sapienzanlp/relik")
|
| 143 |
+
badge(type="pypi", name="relik")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def run_client():
|
| 147 |
+
with open(Path(__file__).parent / "style.css") as f:
|
| 148 |
+
css = f.read()
|
| 149 |
+
|
| 150 |
+
st.set_page_config(
|
| 151 |
+
page_title="ReLik",
|
| 152 |
+
page_icon="🦮",
|
| 153 |
+
layout="wide",
|
| 154 |
+
)
|
| 155 |
+
set_sidebar(css)
|
| 156 |
+
set_intro(css)
|
| 157 |
+
|
| 158 |
+
# text input
|
| 159 |
+
text = st.text_area(
|
| 160 |
+
"Enter Text Below:",
|
| 161 |
+
value="Obama went to Rome for a quick vacation.",
|
| 162 |
+
height=200,
|
| 163 |
+
max_chars=500,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
with stylable_container(
|
| 167 |
+
key="annotate_button",
|
| 168 |
+
css_styles="""
|
| 169 |
+
button {
|
| 170 |
+
background-color: #802433;
|
| 171 |
+
color: white;
|
| 172 |
+
border-radius: 25px;
|
| 173 |
+
}
|
| 174 |
+
""",
|
| 175 |
+
):
|
| 176 |
+
submit = st.button("Annotate")
|
| 177 |
+
# submit = st.button("Run")
|
| 178 |
+
|
| 179 |
+
# ReLik API call
|
| 180 |
+
if submit:
|
| 181 |
+
text = text.strip()
|
| 182 |
+
if text:
|
| 183 |
+
st.markdown("####")
|
| 184 |
+
st.markdown("#### Entity Linking")
|
| 185 |
+
with st.spinner(text="In progress"):
|
| 186 |
+
response = requests.post(RELIK, json=text)
|
| 187 |
+
if response.status_code != 200:
|
| 188 |
+
st.error("Error: {}".format(response.status_code))
|
| 189 |
+
else:
|
| 190 |
+
response = response.json()
|
| 191 |
+
|
| 192 |
+
# Entity Linking
|
| 193 |
+
# with stylable_container(
|
| 194 |
+
# key="container_with_border",
|
| 195 |
+
# css_styles="""
|
| 196 |
+
# {
|
| 197 |
+
# border: 1px solid rgba(49, 51, 63, 0.2);
|
| 198 |
+
# border-radius: 0.5rem;
|
| 199 |
+
# padding: 0.5rem;
|
| 200 |
+
# padding-bottom: 2rem;
|
| 201 |
+
# }
|
| 202 |
+
# """,
|
| 203 |
+
# ):
|
| 204 |
+
# st.markdown("##")
|
| 205 |
+
dict_of_ents, options = get_el_annotations(response=response)
|
| 206 |
+
display = displacy.render(
|
| 207 |
+
dict_of_ents, manual=True, style="ent", options=options
|
| 208 |
+
)
|
| 209 |
+
display = display.replace("\n", " ")
|
| 210 |
+
# wsd_display = re.sub(
|
| 211 |
+
# r"(wiki::\d+\w)",
|
| 212 |
+
# r"<a href='https://babelnet.org/synset?id=\g<1>&orig=\g<1>&lang={}'>\g<1></a>".format(
|
| 213 |
+
# language.upper()
|
| 214 |
+
# ),
|
| 215 |
+
# wsd_display,
|
| 216 |
+
# )
|
| 217 |
+
with st.container():
|
| 218 |
+
st.write(display, unsafe_allow_html=True)
|
| 219 |
+
|
| 220 |
+
st.markdown("####")
|
| 221 |
+
st.markdown("#### Relation Extraction")
|
| 222 |
+
|
| 223 |
+
with st.container():
|
| 224 |
+
st.write("Coming :)", unsafe_allow_html=True)
|
| 225 |
+
|
| 226 |
+
else:
|
| 227 |
+
st.error("Please enter some text.")
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
if __name__ == "__main__":
|
| 231 |
+
run_client()
|
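The frontend renders entity-linking output with spaCy's displacy in "manual" mode, so the server response only has to look like a displacy-compatible dict. A minimal, self-contained sketch of that rendering; the response dict below is hand-written for illustration (in the app it comes from get_el_annotations), and the entity titles and colors are placeholders:

from spacy import displacy

response = {
    "text": "Obama went to Rome for a quick vacation.",
    "ents": [
        {"start": 0, "end": 5, "label": "Barack Obama"},
        {"start": 14, "end": 18, "label": "Rome"},
    ],
    "title": None,
}
options = {
    "ents": {"Barack Obama", "Rome"},
    "colors": {"Barack Obama": "#802433", "Rome": "#802433"},
}

# manual=True tells displacy to use the dict directly instead of a spaCy Doc
html = displacy.render(response, manual=True, style="ent", options=options)
print(html[:80])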
relik/inference/serve/frontend/style.css
ADDED
@@ -0,0 +1,33 @@
/* Sidebar */
.eczjsme11 {
  background-color: #802433;
}

.st-emotion-cache-10oheav h2 {
  color: white;
}

.st-emotion-cache-10oheav li {
  color: white;
}

/* Main */
a:link {
  text-decoration: none;
  color: white;
}

a:visited {
  text-decoration: none;
  color: white;
}

a:hover {
  text-decoration: none;
  color: rgba(255, 255, 255, 0.871);
}

a:active {
  text-decoration: none;
  color: white;
}
relik/reader/__init__.py
ADDED
File without changes
relik/reader/conf/config.yaml
ADDED
@@ -0,0 +1,14 @@
# Required to make the "experiments" dir the default one for the output of the models
hydra:
  run:
    dir: ./experiments/${model_name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

model_name: relik-reader-deberta-base  # used to name the model in wandb and output dir
project_name: relik-reader  # used to name the project in wandb


defaults:
  - _self_
  - training: base
  - model: base
  - data: base
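This top-level config relies on Hydra's defaults list and ${...} interpolation. A minimal sketch of composing it programmatically without launching a run; it assumes hydra-core is installed and that the snippet is executed with the relative config_path below pointing at relik/reader/conf (both are assumptions, not part of the commit):

from hydra import compose, initialize

with initialize(config_path="conf", version_base=None):
    # overrides use the same dotted syntax as the training CLI
    cfg = compose(config_name="config", overrides=["model_name=my-run"])

print(cfg.model_name)    # my-run
print(cfg.project_name)  # relik-reader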
relik/reader/conf/data/base.yaml
ADDED
@@ -0,0 +1,21 @@
train_dataset_path: "relik/reader/data/train.jsonl"
val_dataset_path: "relik/reader/data/testa.jsonl"

train_dataset:
  _target_: "relik.reader.relik_reader_data.RelikDataset"
  transformer_model: "${model.model.transformer_model}"
  materialize_samples: False
  shuffle_candidates: 0.5
  random_drop_gold_candidates: 0.05
  noise_param: 0.0
  for_inference: False
  tokens_per_batch: 4096
  special_symbols: null

val_dataset:
  _target_: "relik.reader.relik_reader_data.RelikDataset"
  transformer_model: "${model.model.transformer_model}"
  materialize_samples: False
  shuffle_candidates: False
  for_inference: True
  special_symbols: null
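The _target_ keys above are consumed by hydra.utils.instantiate, which turns the node into a RelikDataset object. A sketch of that step under stated assumptions: special_symbols is null in the YAML and must be supplied at runtime, so the symbols, model name, and path below are placeholders:

import hydra
from omegaconf import OmegaConf

node = OmegaConf.create(
    {
        "_target_": "relik.reader.relik_reader_data.RelikDataset",
        "dataset_path": "relik/reader/data/testa.jsonl",  # placeholder path
        "materialize_samples": False,
        "transformer_model": "microsoft/deberta-v3-base",  # placeholder model
        "special_symbols": ["[E-0]", "[E-1]"],  # placeholder symbols
        "for_inference": True,
    }
)
val_dataset = hydra.utils.instantiate(node)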
relik/reader/conf/data/re.yaml
ADDED
@@ -0,0 +1,54 @@
train_dataset_path: "relik/reader/data/nyt-alby+/train.jsonl"
val_dataset_path: "relik/reader/data/nyt-alby+/valid.jsonl"
test_dataset_path: "relik/reader/data/nyt-alby+/test.jsonl"

relations_definitions:
  /people/person/nationality: "nationality"
  /sports/sports_team/location: "sports team location"
  /location/country/administrative_divisions: "administrative divisions"
  /business/company/major_shareholders: "shareholders"
  /people/ethnicity/people: "ethnicity"
  /people/ethnicity/geographic_distribution: "geographic distribution"
  /business/company_shareholder/major_shareholder_of: "major shareholder"
  /location/location/contains: "location"
  /business/company/founders: "founders"
  /business/person/company: "company"
  /business/company/advisors: "advisor"
  /people/deceased_person/place_of_death: "place of death"
  /business/company/industry: "industry"
  /people/person/ethnicity: "ethnic background"
  /people/person/place_of_birth: "place of birth"
  /location/administrative_division/country: "country of an administration division"
  /people/person/place_lived: "place lived"
  /sports/sports_team_location/teams: "sports team"
  /people/person/children: "child"
  /people/person/religion: "religion"
  /location/neighborhood/neighborhood_of: "neighborhood"
  /location/country/capital: "capital"
  /business/company/place_founded: "company founded location"
  /people/person/profession: "occupation"

train_dataset:
  _target_: "relik.reader.relik_reader_re_data.RelikREDataset"
  transformer_model: "${model.model.transformer_model}"
  materialize_samples: False
  shuffle_candidates: False
  flip_candidates: 1.0
  noise_param: 0.0
  for_inference: False
  tokens_per_batch: 4096
  min_length: -1
  special_symbols: null
  relations_definitions: ${data.relations_definitions}
  sorting_fields:
    - "predictable_candidates"

val_dataset:
  _target_: "relik.reader.relik_reader_re_data.RelikREDataset"
  transformer_model: "${model.model.transformer_model}"
  materialize_samples: False
  shuffle_candidates: False
  flip_candidates: False
  for_inference: True
  min_length: -1
  special_symbols: null
  relations_definitions: ${data.relations_definitions}
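The relations_definitions map verbalizes NYT relation identifiers so the reader can treat them as text candidates. A toy lookup showing the intended direction of the mapping; the triplet below is invented for illustration and is not from the dataset:

relations_definitions = {
    "/people/person/place_of_birth": "place of birth",
    "/location/country/capital": "capital",
}

# hypothetical predicted triplet: (subject, relation id, object)
subj, rel_id, obj = ("Barack Obama", "/people/person/place_of_birth", "Honolulu")
print(f"{subj} -- {relations_definitions[rel_id]} --> {obj}")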
relik/reader/conf/training/base.yaml
ADDED
@@ -0,0 +1,12 @@
seed: 94

trainer:
  _target_: lightning.Trainer
  devices:
    - 0
  precision: "16-mixed"
  max_steps: 50000
  val_check_interval: 1.0
  num_sanity_val_steps: 0
  limit_val_batches: 1
  gradient_clip_val: 1.0
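The trainer node is hydra-instantiable like the dataset nodes. A minimal sketch of what a training entry point effectively does with it, with field values copied from the YAML (assumes lightning 2.x and hydra-core are installed):

import hydra
from omegaconf import OmegaConf

trainer_cfg = OmegaConf.create(
    {
        "_target_": "lightning.Trainer",
        "devices": [0],
        "precision": "16-mixed",
        "max_steps": 50000,
        "val_check_interval": 1.0,
        "num_sanity_val_steps": 0,
        "limit_val_batches": 1,
        "gradient_clip_val": 1.0,
    }
)
trainer = hydra.utils.instantiate(trainer_cfg)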
relik/reader/conf/training/re.yaml
ADDED
@@ -0,0 +1,12 @@
seed: 15

trainer:
  _target_: lightning.Trainer
  devices:
    - 0
  precision: "16-mixed"
  max_steps: 100000
  val_check_interval: 1.0
  num_sanity_val_steps: 0
  limit_val_batches: 1
  gradient_clip_val: 1.0
relik/reader/data/__init__.py
ADDED
File without changes
relik/reader/data/patches.py
ADDED
@@ -0,0 +1,51 @@
from typing import List

from relik.reader.data.relik_reader_sample import RelikReaderSample
from relik.reader.utils.special_symbols import NME_SYMBOL


def merge_patches_predictions(sample) -> None:
    sample._d["predicted_window_labels"] = dict()
    predicted_window_labels = sample._d["predicted_window_labels"]

    sample._d["span_title_probabilities"] = dict()
    span_title_probabilities = sample._d["span_title_probabilities"]

    span2title = dict()
    for _, patch_info in sorted(sample.patches.items(), key=lambda x: x[0]):
        # selecting span predictions
        for predicted_title, predicted_spans in patch_info[
            "predicted_window_labels"
        ].items():
            for pred_span in predicted_spans:
                pred_span = tuple(pred_span)
                curr_title = span2title.get(pred_span)
                if curr_title is None or curr_title == NME_SYMBOL:
                    span2title[pred_span] = predicted_title
                # else:
                #     print("Merging at patch level")

        # selecting span predictions probability
        for predicted_span, titles_probabilities in patch_info[
            "span_title_probabilities"
        ].items():
            if predicted_span not in span_title_probabilities:
                span_title_probabilities[predicted_span] = titles_probabilities

    for span, title in span2title.items():
        if title not in predicted_window_labels:
            predicted_window_labels[title] = list()
        predicted_window_labels[title].append(span)


def remove_duplicate_samples(
    samples: List[RelikReaderSample],
) -> List[RelikReaderSample]:
    seen_sample = set()
    samples_store = []
    for sample in samples:
        sample_id = f"{sample.doc_id}#{sample.sent_id}#{sample.offset}"
        if sample_id not in seen_sample:
            seen_sample.add(sample_id)
            samples_store.append(sample)
    return samples_store
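A toy run of merge_patches_predictions under stated assumptions: the sample fields below are hand-built, and the behavior shown is that when two patches predict the same span, an NME prediction yields to the first concrete title:

from relik.reader.data.patches import merge_patches_predictions
from relik.reader.data.relik_reader_sample import RelikReaderSample
from relik.reader.utils.special_symbols import NME_SYMBOL

sample = RelikReaderSample(
    patches={
        0: {
            "predicted_window_labels": {NME_SYMBOL: [(0, 5)]},
            "span_title_probabilities": {},
        },
        1: {
            "predicted_window_labels": {"Barack Obama": [(0, 5)]},
            "span_title_probabilities": {},
        },
    }
)
merge_patches_predictions(sample)
print(sample.predicted_window_labels)  # {'Barack Obama': [(0, 5)]}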
relik/reader/data/relik_reader_data.py
ADDED
@@ -0,0 +1,965 @@
import logging
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    Iterable,
    Iterator,
    List,
    NamedTuple,
    Optional,
    Tuple,
    Union,
)

import numpy as np
import torch
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer

from relik.reader.data.relik_reader_data_utils import (
    add_noise_to_value,
    batchify,
    chunks,
    flatten,
)
from relik.reader.data.relik_reader_sample import (
    RelikReaderSample,
    load_relik_reader_samples,
)
from relik.reader.utils.special_symbols import NME_SYMBOL

logger = logging.getLogger(__name__)


def preprocess_dataset(
    input_dataset: Iterable[dict],
    transformer_model: str,
    add_topic: bool,
) -> Iterable[dict]:
    tokenizer = AutoTokenizer.from_pretrained(transformer_model)
    for dataset_elem in tqdm(input_dataset, desc="Preprocessing input dataset"):
        if len(dataset_elem["tokens"]) == 0:
            print(
                f"Dataset element with doc id: {dataset_elem['doc_id']}",
                f"and offset {dataset_elem['offset']} does not contain any token",
                "Skipping it",
            )
            continue

        new_dataset_elem = dict(
            doc_id=dataset_elem["doc_id"],
            offset=dataset_elem["offset"],
        )

        tokenization_out = tokenizer(
            dataset_elem["tokens"],
            return_offsets_mapping=True,
            add_special_tokens=False,
        )

        window_tokens = tokenization_out.input_ids
        window_tokens = flatten(window_tokens)

        offsets_mapping = [
            [
                (
                    ss + dataset_elem["token2char_start"][str(i)],
                    se + dataset_elem["token2char_start"][str(i)],
                )
                for ss, se in tokenization_out.offset_mapping[i]
            ]
            for i in range(len(dataset_elem["tokens"]))
        ]

        offsets_mapping = flatten(offsets_mapping)

        assert len(offsets_mapping) == len(window_tokens)

        window_tokens = (
            [tokenizer.cls_token_id] + window_tokens + [tokenizer.sep_token_id]
        )

        topic_offset = 0
        if add_topic:
            topic_tokens = tokenizer(
                dataset_elem["doc_topic"], add_special_tokens=False
            ).input_ids
            topic_offset = len(topic_tokens)
            new_dataset_elem["topic_tokens"] = topic_offset
            window_tokens = window_tokens[:1] + topic_tokens + window_tokens[1:]

        new_dataset_elem.update(
            dict(
                tokens=window_tokens,
                token2char_start={
                    str(i): s
                    for i, (s, _) in enumerate(offsets_mapping, start=topic_offset)
                },
                token2char_end={
                    str(i): e
                    for i, (_, e) in enumerate(offsets_mapping, start=topic_offset)
                },
                window_candidates=dataset_elem["window_candidates"],
                window_candidates_scores=dataset_elem.get("window_candidates_scores"),
            )
        )

        if "window_labels" in dataset_elem:
            window_labels = [
                (s, e, l.replace("_", " ")) for s, e, l in dataset_elem["window_labels"]
            ]

            new_dataset_elem["window_labels"] = window_labels

            if not all(
                [
                    s in new_dataset_elem["token2char_start"].values()
                    for s, _, _ in new_dataset_elem["window_labels"]
                ]
            ):
                print(
                    "Mismatching token start char mapping with labels",
                    new_dataset_elem["token2char_start"],
                    new_dataset_elem["window_labels"],
                    dataset_elem["tokens"],
                )
                continue

            if not all(
                [
                    e in new_dataset_elem["token2char_end"].values()
                    for _, e, _ in new_dataset_elem["window_labels"]
                ]
            ):
                print(
                    "Mismatching token end char mapping with labels",
                    new_dataset_elem["token2char_end"],
                    new_dataset_elem["window_labels"],
                    dataset_elem["tokens"],
                )
                continue

        yield new_dataset_elem


def preprocess_sample(
    relik_sample: RelikReaderSample,
    tokenizer,
    lowercase_policy: float,
    add_topic: bool = False,
) -> None:
    if len(relik_sample.tokens) == 0:
        return

    if lowercase_policy > 0:
        lc_tokens = np.random.uniform(0, 1, len(relik_sample.tokens)) < lowercase_policy
        relik_sample.tokens = [
            t.lower() if lc else t for t, lc in zip(relik_sample.tokens, lc_tokens)
        ]

    tokenization_out = tokenizer(
        relik_sample.tokens,
        return_offsets_mapping=True,
        add_special_tokens=False,
    )

    window_tokens = tokenization_out.input_ids
    window_tokens = flatten(window_tokens)

    offsets_mapping = [
        [
            (
                ss + relik_sample.token2char_start[str(i)],
                se + relik_sample.token2char_start[str(i)],
            )
            for ss, se in tokenization_out.offset_mapping[i]
        ]
        for i in range(len(relik_sample.tokens))
    ]

    offsets_mapping = flatten(offsets_mapping)

    assert len(offsets_mapping) == len(window_tokens)

    window_tokens = [tokenizer.cls_token_id] + window_tokens + [tokenizer.sep_token_id]

    topic_offset = 0
    if add_topic:
        topic_tokens = tokenizer(
            relik_sample.doc_topic, add_special_tokens=False
        ).input_ids
        topic_offset = len(topic_tokens)
        relik_sample.topic_tokens = topic_offset
        window_tokens = window_tokens[:1] + topic_tokens + window_tokens[1:]

    relik_sample._d.update(
        dict(
            tokens=window_tokens,
            token2char_start={
                str(i): s
                for i, (s, _) in enumerate(offsets_mapping, start=topic_offset)
            },
            token2char_end={
                str(i): e
                for i, (_, e) in enumerate(offsets_mapping, start=topic_offset)
            },
        )
    )

    if "window_labels" in relik_sample._d:
        relik_sample.window_labels = [
            (s, e, l.replace("_", " ")) for s, e, l in relik_sample.window_labels
        ]


class TokenizationOutput(NamedTuple):
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    token_type_ids: torch.Tensor
    prediction_mask: torch.Tensor
    special_symbols_mask: torch.Tensor


class RelikDataset(IterableDataset):
    def __init__(
        self,
        dataset_path: Optional[str],
        materialize_samples: bool,
        transformer_model: Union[str, PreTrainedTokenizer],
        special_symbols: List[str],
        shuffle_candidates: Optional[Union[bool, float]] = False,
        for_inference: bool = False,
        noise_param: float = 0.1,
        sorting_fields: Optional[List[str]] = None,
        tokens_per_batch: int = 2048,
        batch_size: Optional[int] = None,
        max_batch_size: int = 128,
        section_size: int = 50_000,
        prebatch: bool = True,
        random_drop_gold_candidates: float = 0.0,
        use_nme: bool = True,
        max_subwords_per_candidate: int = 22,
        mask_by_instances: bool = False,
        min_length: int = 5,
        max_length: int = 2048,
        model_max_length: int = 1000,
        split_on_cand_overload: bool = True,
        skip_empty_training_samples: bool = False,
        drop_last: bool = False,
        samples: Optional[Iterator[RelikReaderSample]] = None,
        lowercase_policy: float = 0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.dataset_path = dataset_path
        self.materialize_samples = materialize_samples
        self.samples: Optional[List[RelikReaderSample]] = None
        if self.materialize_samples:
            self.samples = list()

        if isinstance(transformer_model, str):
            self.tokenizer = self._build_tokenizer(transformer_model, special_symbols)
        else:
            self.tokenizer = transformer_model
        self.special_symbols = special_symbols
        self.shuffle_candidates = shuffle_candidates
        self.for_inference = for_inference
        self.noise_param = noise_param
        self.batching_fields = ["input_ids"]
        self.sorting_fields = (
            sorting_fields if sorting_fields is not None else self.batching_fields
        )

        self.tokens_per_batch = tokens_per_batch
        self.batch_size = batch_size
        self.max_batch_size = max_batch_size
        self.section_size = section_size
        self.prebatch = prebatch

        self.random_drop_gold_candidates = random_drop_gold_candidates
        self.use_nme = use_nme
        self.max_subwords_per_candidate = max_subwords_per_candidate
        self.mask_by_instances = mask_by_instances
        self.min_length = min_length
        self.max_length = max_length
        self.model_max_length = (
            model_max_length
            if model_max_length < self.tokenizer.model_max_length
            else self.tokenizer.model_max_length
        )

        # retrocompatibility workaround
        self.transformer_model = (
            transformer_model
            if isinstance(transformer_model, str)
            else transformer_model.name_or_path
        )
        self.split_on_cand_overload = split_on_cand_overload
        self.skip_empty_training_samples = skip_empty_training_samples
        self.drop_last = drop_last
        self.lowercase_policy = lowercase_policy
        self.samples = samples

    def _build_tokenizer(self, transformer_model: str, special_symbols: List[str]):
        return AutoTokenizer.from_pretrained(
            transformer_model,
            additional_special_tokens=[ss for ss in special_symbols],
            add_prefix_space=True,
        )

    @property
    def fields_batcher(self) -> Dict[str, Union[None, Callable[[list], Any]]]:
        fields_batchers = {
            "input_ids": lambda x: batchify(
                x, padding_value=self.tokenizer.pad_token_id
            ),
            "attention_mask": lambda x: batchify(x, padding_value=0),
            "token_type_ids": lambda x: batchify(x, padding_value=0),
            "prediction_mask": lambda x: batchify(x, padding_value=1),
            "global_attention": lambda x: batchify(x, padding_value=0),
            "token2word": None,
            "sample": None,
            "special_symbols_mask": lambda x: batchify(x, padding_value=False),
            "start_labels": lambda x: batchify(x, padding_value=-100),
            "end_labels": lambda x: batchify(x, padding_value=-100),
            "predictable_candidates_symbols": None,
            "predictable_candidates": None,
            "patch_offset": None,
            "optimus_labels": None,
        }

        if "roberta" in self.transformer_model:
            del fields_batchers["token_type_ids"]

        return fields_batchers

    def _build_input_ids(
        self, sentence_input_ids: List[int], candidates_input_ids: List[List[int]]
    ) -> List[int]:
        return (
            [self.tokenizer.cls_token_id]
            + sentence_input_ids
            + [self.tokenizer.sep_token_id]
            + flatten(candidates_input_ids)
            + [self.tokenizer.sep_token_id]
        )

    def _get_special_symbols_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
        special_symbols_mask = input_ids >= (
            len(self.tokenizer) - len(self.special_symbols)
        )
        special_symbols_mask[0] = True
        return special_symbols_mask

    def _build_tokenizer_essentials(
        self, input_ids, original_sequence, sample
    ) -> TokenizationOutput:
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.ones_like(input_ids)

        total_sequence_len = len(input_ids)
        predictable_sentence_len = len(original_sequence)

        # token type ids
        token_type_ids = torch.cat(
            [
                input_ids.new_zeros(
                    predictable_sentence_len + 2
                ),  # original sentence bpes + CLS and SEP
                input_ids.new_ones(total_sequence_len - predictable_sentence_len - 2),
            ]
        )

        # prediction mask -> boolean on tokens that are predictable

        prediction_mask = torch.tensor(
            [1]
            + ([0] * predictable_sentence_len)
            + ([1] * (total_sequence_len - predictable_sentence_len - 1))
        )

        # add topic tokens to the prediction mask so that they cannot be predicted
        # or optimized during training
        topic_tokens = getattr(sample, "topic_tokens", None)
        if topic_tokens is not None:
            prediction_mask[1 : 1 + topic_tokens] = 1

        # If mask by instances is active the prediction mask is applied to everything
        # that is not indicated as an instance in the training set.
        if self.mask_by_instances:
            char_start2token = {
                cs: int(tok) for tok, cs in sample.token2char_start.items()
            }
            char_end2token = {ce: int(tok) for tok, ce in sample.token2char_end.items()}
            instances_mask = torch.ones_like(prediction_mask)
            for _, span_info in sample.instance_id2span_data.items():
                span_info = span_info[0]
                token_start = char_start2token[span_info[0]] + 1  # +1 for the CLS
                token_end = char_end2token[span_info[1]] + 1  # +1 for the CLS
                instances_mask[token_start : token_end + 1] = 0

            prediction_mask += instances_mask
            prediction_mask[prediction_mask > 1] = 1

        assert len(prediction_mask) == len(input_ids)

        # special symbols mask
        special_symbols_mask = self._get_special_symbols_mask(input_ids)

        return TokenizationOutput(
            input_ids,
            attention_mask,
            token_type_ids,
            prediction_mask,
            special_symbols_mask,
        )

    def _build_labels(
        self,
        sample,
        tokenization_output: TokenizationOutput,
        predictable_candidates: List[str],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        start_labels = [0] * len(tokenization_output.input_ids)
        end_labels = [0] * len(tokenization_output.input_ids)

        char_start2token = {v: int(k) for k, v in sample.token2char_start.items()}
        char_end2token = {v: int(k) for k, v in sample.token2char_end.items()}
        for cs, ce, gold_candidate_title in sample.window_labels:
            if gold_candidate_title not in predictable_candidates:
                if self.use_nme:
                    gold_candidate_title = NME_SYMBOL
                else:
                    continue
            # +1 is to account for the CLS token
            start_bpe = char_start2token[cs] + 1
            end_bpe = char_end2token[ce] + 1
            class_index = predictable_candidates.index(gold_candidate_title)
            if (
                start_labels[start_bpe] == 0 and end_labels[end_bpe] == 0
            ):  # prevent entities that end on the same subword
                start_labels[start_bpe] = class_index + 1  # +1 for the NONE class
                end_labels[end_bpe] = class_index + 1  # +1 for the NONE class
            else:
                print(
                    "Found entity with the same last subword, it will not be included."
                )
                print(
                    cs,
                    ce,
                    gold_candidate_title,
                    start_labels,
                    end_labels,
                    sample.doc_id,
                )

        ignored_labels_indices = tokenization_output.prediction_mask == 1

        start_labels = torch.tensor(start_labels, dtype=torch.long)
        start_labels[ignored_labels_indices] = -100

        end_labels = torch.tensor(end_labels, dtype=torch.long)
        end_labels[ignored_labels_indices] = -100

        return start_labels, end_labels

    def produce_sample_bag(
        self, sample, predictable_candidates: List[str], candidates_starting_offset: int
    ) -> Optional[Tuple[dict, list, int]]:
        # input sentence tokenization
        input_subwords = sample.tokens[1:-1]  # removing special tokens
        candidates_symbols = self.special_symbols[candidates_starting_offset:]

        predictable_candidates = list(predictable_candidates)
        original_predictable_candidates = list(predictable_candidates)

        # add NME as a possible candidate
        if self.use_nme:
            predictable_candidates.insert(0, NME_SYMBOL)

        # candidates encoding
        candidates_symbols = candidates_symbols[: len(predictable_candidates)]
        candidates_encoding_result = self.tokenizer.batch_encode_plus(
            [
                "{} {}".format(cs, ct) if ct != NME_SYMBOL else NME_SYMBOL
                for cs, ct in zip(candidates_symbols, predictable_candidates)
            ],
            add_special_tokens=False,
        ).input_ids

        if (
            self.max_subwords_per_candidate is not None
            and self.max_subwords_per_candidate > 0
        ):
            candidates_encoding_result = [
                cer[: self.max_subwords_per_candidate]
                for cer in candidates_encoding_result
            ]

        # drop candidates if the number of input tokens is too long for the model
        if (
            sum(map(len, candidates_encoding_result))
            + len(input_subwords)
            + 20  # + 20 special tokens
            > self.model_max_length
        ):
            acceptable_tokens_from_candidates = (
                self.model_max_length - 20 - len(input_subwords)
            )
            i = 0
            cum_len = 0
            while (
                cum_len + len(candidates_encoding_result[i])
                < acceptable_tokens_from_candidates
            ):
                cum_len += len(candidates_encoding_result[i])
                i += 1

            candidates_encoding_result = candidates_encoding_result[:i]
            candidates_symbols = candidates_symbols[:i]
            predictable_candidates = predictable_candidates[:i]

        # final input_ids build
        input_ids = self._build_input_ids(
            sentence_input_ids=input_subwords,
            candidates_input_ids=candidates_encoding_result,
        )

        # complete input building (e.g. attention / prediction mask)
        tokenization_output = self._build_tokenizer_essentials(
            input_ids, input_subwords, sample
        )

        output_dict = {
            "input_ids": tokenization_output.input_ids,
            "attention_mask": tokenization_output.attention_mask,
            "token_type_ids": tokenization_output.token_type_ids,
            "prediction_mask": tokenization_output.prediction_mask,
            "special_symbols_mask": tokenization_output.special_symbols_mask,
            "sample": sample,
            "predictable_candidates_symbols": candidates_symbols,
            "predictable_candidates": predictable_candidates,
        }

        # labels creation
        if sample.window_labels is not None:
            start_labels, end_labels = self._build_labels(
                sample,
                tokenization_output,
                predictable_candidates,
            )
            output_dict.update(start_labels=start_labels, end_labels=end_labels)

        if (
            "roberta" in self.transformer_model
            or "longformer" in self.transformer_model
        ):
            del output_dict["token_type_ids"]

        predictable_candidates_set = set(predictable_candidates)
        remaining_candidates = [
            candidate
            for candidate in original_predictable_candidates
            if candidate not in predictable_candidates_set
        ]
        total_used_candidates = (
            candidates_starting_offset
            + len(predictable_candidates)
            - (1 if self.use_nme else 0)
        )

        if self.use_nme:
            assert predictable_candidates[0] == NME_SYMBOL

        return output_dict, remaining_candidates, total_used_candidates

    def __iter__(self):
        dataset_iterator = self.dataset_iterator_func()

        current_dataset_elements = []

        i = None
        for i, dataset_elem in enumerate(dataset_iterator, start=1):
            if (
                self.section_size is not None
                and len(current_dataset_elements) == self.section_size
            ):
                for batch in self.materialize_batches(current_dataset_elements):
                    yield batch
                current_dataset_elements = []

            current_dataset_elements.append(dataset_elem)

            if i % 50_000 == 0:
                logger.info(f"Processed {i} elements")

        if len(current_dataset_elements) != 0:
            for batch in self.materialize_batches(current_dataset_elements):
                yield batch

        if i is not None:
            logger.info(f"Dataset finished: {i} elements processed")
        else:
            logger.warning("Dataset empty")

    def dataset_iterator_func(self):
        skipped_instances = 0
        data_samples = (
            load_relik_reader_samples(self.dataset_path)
            if self.samples is None
            else self.samples
        )
        for sample in data_samples:
            preprocess_sample(
                sample, self.tokenizer, lowercase_policy=self.lowercase_policy
            )
            current_patch = 0
            sample_bag, used_candidates = None, None
            remaining_candidates = list(sample.window_candidates)

            if not self.for_inference:
                # randomly drop gold candidates at training time
                if (
                    self.random_drop_gold_candidates > 0.0
                    and np.random.uniform() < self.random_drop_gold_candidates
                    and len(set(ct for _, _, ct in sample.window_labels)) > 1
                ):
                    # selecting candidates to drop
                    np.random.shuffle(sample.window_labels)
                    n_dropped_candidates = np.random.randint(
                        0, len(sample.window_labels) - 1
                    )
                    dropped_candidates = [
                        label_elem[-1]
                        for label_elem in sample.window_labels[:n_dropped_candidates]
                    ]
                    dropped_candidates = set(dropped_candidates)

                    # saving NMEs because they should not be dropped
                    if NME_SYMBOL in dropped_candidates:
                        dropped_candidates.remove(NME_SYMBOL)

                    # sample update
                    sample.window_labels = [
                        (s, e, _l)
                        if _l not in dropped_candidates
                        else (s, e, NME_SYMBOL)
                        for s, e, _l in sample.window_labels
                    ]
                    remaining_candidates = [
                        wc
                        for wc in remaining_candidates
                        if wc not in dropped_candidates
                    ]

                # shuffle candidates
                if (
                    isinstance(self.shuffle_candidates, bool)
                    and self.shuffle_candidates
                ) or (
                    isinstance(self.shuffle_candidates, float)
                    and np.random.uniform() < self.shuffle_candidates
                ):
                    np.random.shuffle(remaining_candidates)

            while len(remaining_candidates) != 0:
                sample_bag = self.produce_sample_bag(
                    sample,
                    predictable_candidates=remaining_candidates,
                    candidates_starting_offset=used_candidates
                    if used_candidates is not None
                    else 0,
                )
                if sample_bag is not None:
                    sample_bag, remaining_candidates, used_candidates = sample_bag
                    if (
                        self.for_inference
                        or not self.skip_empty_training_samples
                        or (
                            (
                                sample_bag.get("start_labels") is not None
                                and torch.any(sample_bag["start_labels"] > 1).item()
                            )
                            or (
                                sample_bag.get("optimus_labels") is not None
                                and len(sample_bag["optimus_labels"]) > 0
                            )
                        )
                    ):
                        sample_bag["patch_offset"] = current_patch
                        current_patch += 1
                        yield sample_bag
                    else:
                        skipped_instances += 1
                        if skipped_instances % 1000 == 0 and skipped_instances != 0:
                            logger.info(
                                f"Skipped {skipped_instances} instances since they did not have any gold labels..."
                            )

                # Just use the first fitting candidates if split on
                # cand is not True
                if not self.split_on_cand_overload:
                    break

    def preshuffle_elements(self, dataset_elements: List):
        # This shuffling is done so that when using the sorting function,
        # if it is deterministic given a collection and its order, we will
        # make the whole operation not deterministic anymore.
        # Basically, the aim is not to build every time the same batches.
        if not self.for_inference:
            dataset_elements = np.random.permutation(dataset_elements)

        sorting_fn = (
            lambda elem: add_noise_to_value(
                sum(len(elem[k]) for k in self.sorting_fields),
                noise_param=self.noise_param,
            )
            if not self.for_inference
            else sum(len(elem[k]) for k in self.sorting_fields)
        )

        dataset_elements = sorted(dataset_elements, key=sorting_fn)

        if self.for_inference:
            return dataset_elements

        ds = list(chunks(dataset_elements, 64))
        np.random.shuffle(ds)
        return flatten(ds)

    def materialize_batches(
        self, dataset_elements: List[Dict[str, Any]]
    ) -> Generator[Dict[str, Any], None, None]:
        if self.prebatch:
            dataset_elements = self.preshuffle_elements(dataset_elements)

        current_batch = []

        # function that creates a batch from the 'current_batch' list
        def output_batch() -> Dict[str, Any]:
            assert (
                len(
                    set([len(elem["predictable_candidates"]) for elem in current_batch])
                )
                == 1
            ), " ".join(
                map(
                    str, [len(elem["predictable_candidates"]) for elem in current_batch]
                )
            )

            batch_dict = dict()

            de_values_by_field = {
                fn: [de[fn] for de in current_batch if fn in de]
                for fn in self.fields_batcher
            }

            # in case you provide fields batchers but in the batch
            # there are no elements for that field
            de_values_by_field = {
                fn: fvs for fn, fvs in de_values_by_field.items() if len(fvs) > 0
            }

            assert len(set([len(v) for v in de_values_by_field.values()]))

            # todo: maybe we should inform the user about possible
            # fields filtering due to "None" instances
            de_values_by_field = {
                fn: fvs
                for fn, fvs in de_values_by_field.items()
                if all([fv is not None for fv in fvs])
            }

            for field_name, field_values in de_values_by_field.items():
                field_batch = (
                    self.fields_batcher[field_name](field_values)
                    if self.fields_batcher[field_name] is not None
                    else field_values
                )

                batch_dict[field_name] = field_batch

            return batch_dict

        max_len_discards, min_len_discards = 0, 0

        should_token_batch = self.batch_size is None

        curr_pred_elements = -1
        for de in dataset_elements:
            if (
                should_token_batch
                and self.max_batch_size != -1
                and len(current_batch) == self.max_batch_size
            ) or (not should_token_batch and len(current_batch) == self.batch_size):
                yield output_batch()
                current_batch = []
                curr_pred_elements = -1

            too_long_fields = [
                k
                for k in de
                if self.max_length != -1
                and torch.is_tensor(de[k])
                and len(de[k]) > self.max_length
            ]
            if len(too_long_fields) > 0:
                max_len_discards += 1
                continue

            too_short_fields = [
                k
                for k in de
                if self.min_length != -1
                and torch.is_tensor(de[k])
                and len(de[k]) < self.min_length
            ]
            if len(too_short_fields) > 0:
                min_len_discards += 1
                continue

            if should_token_batch:
                de_len = sum(len(de[k]) for k in self.batching_fields)

                future_max_len = max(
                    de_len,
                    max(
                        [
                            sum(len(bde[k]) for k in self.batching_fields)
                            for bde in current_batch
                        ],
                        default=0,
                    ),
                )

                future_tokens_per_batch = future_max_len * (len(current_batch) + 1)

                num_predictable_candidates = len(de["predictable_candidates"])

                if len(current_batch) > 0 and (
                    future_tokens_per_batch >= self.tokens_per_batch
                    or (
                        num_predictable_candidates != curr_pred_elements
                        and curr_pred_elements != -1
                    )
                ):
                    yield output_batch()
                    current_batch = []

            current_batch.append(de)
            curr_pred_elements = len(de["predictable_candidates"])

        if len(current_batch) != 0 and not self.drop_last:
            yield output_batch()

        if max_len_discards > 0:
            if self.for_inference:
                logger.warning(
                    f"WARNING: Inference mode is True but {max_len_discards} samples longer than max length were "
                    f"found. The {max_len_discards} samples will be DISCARDED. If you are doing some kind of evaluation"
                    f", this can INVALIDATE results. This might happen if the max length was not set to -1 or if the "
                    f"sample length exceeds the maximum length supported by the current model."
                )
            else:
                logger.warning(
                    f"During iteration, {max_len_discards} elements were "
                    f"discarded since longer than max length {self.max_length}"
                )

        if min_len_discards > 0:
            if self.for_inference:
                logger.warning(
                    f"WARNING: Inference mode is True but {min_len_discards} samples shorter than min length were "
                    f"found. The {min_len_discards} samples will be DISCARDED. If you are doing some kind of evaluation"
                    f", this can INVALIDATE results. This might happen if the min length was not set to -1 or if the "
                    f"sample length is shorter than the minimum length supported by the current model."
                )
            else:
                logger.warning(
                    f"During iteration, {min_len_discards} elements were "
                    f"discarded since shorter than min length {self.min_length}"
                )

    @staticmethod
    def convert_tokens_to_char_annotations(
        sample: RelikReaderSample,
        remove_nmes: bool = True,
    ) -> RelikReaderSample:
        """
        Converts the token annotations to char annotations.

        Args:
            sample (:obj:`RelikReaderSample`):
                The sample to convert.
            remove_nmes (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether to remove the NMEs from the annotations.
        Returns:
            :obj:`RelikReaderSample`: The converted sample.
        """
        char_annotations = set()
        for (
            predicted_entity,
            predicted_spans,
        ) in sample.predicted_window_labels.items():
            if predicted_entity == NME_SYMBOL and remove_nmes:
                continue

            for span_start, span_end in predicted_spans:
                span_start = sample.token2char_start[str(span_start)]
                span_end = sample.token2char_end[str(span_end)]

                char_annotations.add((span_start, span_end, predicted_entity))

        char_probs_annotations = dict()
        for (
            span_start,
            span_end,
        ), candidates_probs in sample.span_title_probabilities.items():
            span_start = sample.token2char_start[str(span_start)]
            span_end = sample.token2char_end[str(span_end)]
            char_probs_annotations[(span_start, span_end)] = {
                title for title, _ in candidates_probs
            }

        sample.predicted_window_labels_chars = char_annotations
        sample.probs_window_labels_chars = char_probs_annotations

        return sample

    @staticmethod
    def merge_patches_predictions(sample) -> None:
        sample._d["predicted_window_labels"] = dict()
        predicted_window_labels = sample._d["predicted_window_labels"]

        sample._d["span_title_probabilities"] = dict()
        span_title_probabilities = sample._d["span_title_probabilities"]

        span2title = dict()
        for _, patch_info in sorted(sample.patches.items(), key=lambda x: x[0]):
            # selecting span predictions
            for predicted_title, predicted_spans in patch_info[
                "predicted_window_labels"
            ].items():
                for pred_span in predicted_spans:
                    pred_span = tuple(pred_span)
                    curr_title = span2title.get(pred_span)
                    if curr_title is None or curr_title == NME_SYMBOL:
                        span2title[pred_span] = predicted_title
                    # else:
                    #     print("Merging at patch level")

            # selecting span predictions probability
            for predicted_span, titles_probabilities in patch_info[
                "span_title_probabilities"
            ].items():
                if predicted_span not in span_title_probabilities:
                    span_title_probabilities[predicted_span] = titles_probabilities

        for span, title in span2title.items():
            if title not in predicted_window_labels:
                predicted_window_labels[title] = list()
            predicted_window_labels[title].append(span)
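A sketch of driving the dataset above. The path, model name, and special symbols are placeholders, not the project's real values; since RelikDataset already yields fully collated batches, the DataLoader is used with batch_size=None so it does not re-collate:

from torch.utils.data import DataLoader

from relik.reader.data.relik_reader_data import RelikDataset

dataset = RelikDataset(
    dataset_path="relik/reader/data/testa.jsonl",  # placeholder path
    materialize_samples=False,
    transformer_model="microsoft/deberta-v3-base",  # placeholder model
    special_symbols=[f"[E-{i}]" for i in range(10)],  # placeholder symbols
    for_inference=True,
)

for batch in DataLoader(dataset, batch_size=None, num_workers=0):
    print(batch["input_ids"].shape, len(batch["predictable_candidates"]))
    break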
relik/reader/data/relik_reader_data_utils.py
ADDED
@@ -0,0 +1,51 @@
from typing import List

import numpy as np
import torch


def flatten(lsts: List[list]) -> list:
    acc_lst = list()
    for lst in lsts:
        acc_lst.extend(lst)
    return acc_lst


def batchify(tensors: List[torch.Tensor], padding_value: int = 0) -> torch.Tensor:
    return torch.nn.utils.rnn.pad_sequence(
        tensors, batch_first=True, padding_value=padding_value
    )


def batchify_matrices(tensors: List[torch.Tensor], padding_value: int) -> torch.Tensor:
    x = max([t.shape[0] for t in tensors])
    y = max([t.shape[1] for t in tensors])
    out_matrix = torch.zeros((len(tensors), x, y))
    out_matrix += padding_value
    for i, tensor in enumerate(tensors):
        out_matrix[i][0 : tensor.shape[0], 0 : tensor.shape[1]] = tensor
    return out_matrix


def batchify_tensor(tensors: List[torch.Tensor], padding_value: int) -> torch.Tensor:
    x = max([t.shape[0] for t in tensors])
    y = max([t.shape[1] for t in tensors])
    rest = tensors[0].shape[2]
    out_matrix = torch.zeros((len(tensors), x, y, rest))
    out_matrix += padding_value
    for i, tensor in enumerate(tensors):
        out_matrix[i][0 : tensor.shape[0], 0 : tensor.shape[1], :] = tensor
    return out_matrix


def chunks(lst: list, chunk_size: int) -> List[list]:
    chunks_acc = list()
    for i in range(0, len(lst), chunk_size):
        chunks_acc.append(lst[i : i + chunk_size])
    return chunks_acc


def add_noise_to_value(value: int, noise_param: float):
    noise_value = value * noise_param
    noise = np.random.uniform(-noise_value, noise_value)
    return max(1, value + noise)
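A quick demonstration of the padding and chunking helpers above, which the dataset uses to build rectangular batches out of variable-length sequences:

import torch

from relik.reader.data.relik_reader_data_utils import batchify, chunks

padded = batchify([torch.tensor([1, 2, 3]), torch.tensor([4, 5])], padding_value=0)
print(padded)  # tensor([[1, 2, 3], [4, 5, 0]])

print(chunks([1, 2, 3, 4, 5], 2))  # [[1, 2], [3, 4], [5]]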
relik/reader/data/relik_reader_sample.py
ADDED
@@ -0,0 +1,49 @@
+import json
+from typing import Iterable
+
+
+class RelikReaderSample:
+    def __init__(self, **kwargs):
+        super().__setattr__("_d", {})
+        self._d = kwargs
+
+    def __getattribute__(self, item):
+        return super(RelikReaderSample, self).__getattribute__(item)
+
+    def __getattr__(self, item):
+        if item.startswith("__") and item.endswith("__"):
+            # this is likely some python library-specific variable (such as __deepcopy__ for copy)
+            # better follow standard behavior here
+            raise AttributeError(item)
+        elif item in self._d:
+            return self._d[item]
+        else:
+            return None
+
+    def __setattr__(self, key, value):
+        if key in self._d:
+            self._d[key] = value
+        else:
+            super().__setattr__(key, value)
+
+    def to_jsons(self) -> str:
+        if "predicted_window_labels" in self._d:
+            new_obj = {
+                k: v
+                for k, v in self._d.items()
+                if k != "predicted_window_labels" and k != "span_title_probabilities"
+            }
+            new_obj["predicted_window_labels"] = [
+                [ss, se, pred_title]
+                for (ss, se), pred_title in self.predicted_window_labels_chars
+            ]
+            return json.dumps(new_obj)  # fix: this branch originally fell through without returning
+        return json.dumps(self._d)
+
+
+def load_relik_reader_samples(path: str) -> Iterable[RelikReaderSample]:
+    with open(path) as f:
+        for line in f:
+            jsonl_line = json.loads(line.strip())
+            relik_reader_sample = RelikReaderSample(**jsonl_line)
+            yield relik_reader_sample
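RelikReaderSample proxies attribute access to an internal dict, so missing fields read as None rather than raising. A small usage sketch (the field names are illustrative, not part of the commit):

from relik.reader.data.relik_reader_sample import RelikReaderSample

sample = RelikReaderSample(doc_id=1, text="Rome is in Italy.")
print(sample.text)        # "Rome is in Italy."
print(sample.candidates)  # None -- unknown keys resolve to None instead of AttributeError
sample.doc_id = 2         # keys already in the sample are updated in place
print(sample.to_jsons())  # {"doc_id": 2, "text": "Rome is in Italy."}

The None-by-default lookup lets downstream code probe optional annotations (gold labels, predictions) without existence checks.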
relik/reader/lightning_modules/__init__.py
ADDED
File without changes
relik/reader/lightning_modules/relik_reader_pl_module.py
ADDED
@@ -0,0 +1,50 @@
+from typing import Any, Optional
+
+import lightning
+from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
+
+from relik.reader.relik_reader_core import RelikReaderCoreModel
+
+
+class RelikReaderPLModule(lightning.LightningModule):
+    def __init__(
+        self,
+        cfg: dict,
+        transformer_model: str,
+        additional_special_symbols: int,
+        num_layers: Optional[int] = None,
+        activation: str = "gelu",
+        linears_hidden_size: Optional[int] = 512,
+        use_last_k_layers: int = 1,
+        training: bool = False,
+        *args: Any,
+        **kwargs: Any
+    ):
+        super().__init__(*args, **kwargs)
+        self.save_hyperparameters()
+        self.relik_reader_core_model = RelikReaderCoreModel(
+            transformer_model,
+            additional_special_symbols,
+            num_layers,
+            activation,
+            linears_hidden_size,
+            use_last_k_layers,
+            training=training,
+        )
+        self.optimizer_factory = None
+
+    def training_step(self, batch: dict, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+        relik_output = self.relik_reader_core_model(**batch)
+        self.log("train-loss", relik_output["loss"])
+        return relik_output["loss"]
+
+    def validation_step(
+        self, batch: dict, *args: Any, **kwargs: Any
+    ) -> Optional[STEP_OUTPUT]:
+        return
+
+    def set_optimizer_factory(self, optimizer_factory) -> None:
+        self.optimizer_factory = optimizer_factory
+
+    def configure_optimizers(self) -> OptimizerLRScheduler:
+        return self.optimizer_factory(self.relik_reader_core_model)
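The module deliberately leaves optimization to an injected factory: configure_optimizers just calls whatever was registered via set_optimizer_factory on the core model. A minimal wiring sketch (the checkpoint name, symbol count, and AdamW settings below are placeholders, not values from this repo):

import torch
from lightning import Trainer

from relik.reader.lightning_modules.relik_reader_pl_module import RelikReaderPLModule

pl_module = RelikReaderPLModule(
    cfg={},
    transformer_model="microsoft/deberta-v3-base",  # placeholder checkpoint
    additional_special_symbols=101,                 # placeholder count
    training=True,
)
pl_module.set_optimizer_factory(
    lambda model: torch.optim.AdamW(model.parameters(), lr=1e-5)
)
trainer = Trainer(max_steps=1000)
# trainer.fit(pl_module, train_dataloaders=train_dl)  # train_dl: your DataLoader

Keeping the factory outside the module means the optimizer (and any scheduler) can be built from configuration at train time without the LightningModule knowing about it.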
relik/reader/lightning_modules/relik_reader_re_pl_module.py
ADDED
@@ -0,0 +1,54 @@
+from typing import Any, Optional
+
+import lightning
+from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
+
+from relik.reader.relik_reader_re import RelikReaderForTripletExtraction
+
+
+class RelikReaderREPLModule(lightning.LightningModule):
+    def __init__(
+        self,
+        cfg: dict,
+        transformer_model: str,
+        additional_special_symbols: int,
+        num_layers: Optional[int] = None,
+        activation: str = "gelu",
+        linears_hidden_size: Optional[int] = 512,
+        use_last_k_layers: int = 1,
+        training: bool = False,
+        *args: Any,
+        **kwargs: Any
+    ):
+        super().__init__(*args, **kwargs)
+        self.save_hyperparameters()
+
+        self.relik_reader_re_model = RelikReaderForTripletExtraction(
+            transformer_model,
+            additional_special_symbols,
+            num_layers,
+            activation,
+            linears_hidden_size,
+            use_last_k_layers,
+            training=training,
+        )
+        self.optimizer_factory = None
+
+    def training_step(self, batch: dict, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+        relik_output = self.relik_reader_re_model(**batch)
+        self.log("train-loss", relik_output["loss"])
+        self.log("train-start_loss", relik_output["ned_start_loss"])
+        self.log("train-end_loss", relik_output["ned_end_loss"])
+        self.log("train-relation_loss", relik_output["re_loss"])
+        return relik_output["loss"]
+
+    def validation_step(
+        self, batch: dict, *args: Any, **kwargs: Any
+    ) -> Optional[STEP_OUTPUT]:
+        return
+
+    def set_optimizer_factory(self, optimizer_factory) -> None:
+        self.optimizer_factory = optimizer_factory
+
+    def configure_optimizers(self) -> OptimizerLRScheduler:
+        return self.optimizer_factory(self.relik_reader_re_model)
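Since the RE variant logs four scalars per step (the total train-loss plus the NED start/end and relation components), any of them can drive standard Lightning callbacks. A sketch using the stock checkpoint callback (the settings are illustrative, not from this repo):

from lightning.pytorch.callbacks import ModelCheckpoint

checkpoint_cb = ModelCheckpoint(
    monitor="train-loss",  # or train-start_loss / train-end_loss / train-relation_loss
    mode="min",
    save_top_k=1,
)
# pass callbacks=[checkpoint_cb] when constructing the Trainer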
relik/reader/pytorch_modules/__init__.py
ADDED
File without changes