In [1]:
from typing import Iterable, Iterator
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [2]:
from langchain.vectorstores import FAISS
path = "/data/tommaso/llm4scilit/data/vector_store"
db = FAISS.load_local(path, model)

In [3]:
db.as_retriever().get_relevant_documents("What are the main serological markers for RA?", metadata={"paper_title": "LC-MS/MS-Based Serum Protein Profiling for Identification of Candidate Biomarkers in Pakistani Rheumatoid Arthritis Patients"})

[Document(page_content='These serum proteins have strong potential to serve as diagnostic and prognostic biomarkers of RA and can also be evaluated to fill the gaps in the current knowledge of pathogenesis of RA.These\n\nfindings can be validated in larger cohorts from different populations to identify diagnostic and prognostic biomarkers of RA.', metadata={'text': 'RA is a complex disease that is influenced by an intricate interactome of various environmental, genetic and microbial factors that influence the immune homeostasis.Owing to the complex genetic architecture accompanied by a plethora of microbial and environmental triggers that an organism is exposed to this has made the identification of diagnostic and prognostic markers challenging.Our study has explored the serum proteomics of this complex autoimmune disorder in a relatively understudied Pakistani population to identify disease biomarkers that are DE among various serotypes of RA patients and healthy controls.We identifie

In [4]:
db.index.ntotal

60

In [5]:
from pathlib import Path
DATA_PATH = Path("/data/tommaso/llm4scilit/data")

In [6]:
import glob
glob.glob(str(DATA_PATH / "papers/*"))

['/data/tommaso/llm4scilit/data/papers/3.pdf',
 '/data/tommaso/llm4scilit/data/papers/2.pdf',
 '/data/tommaso/llm4scilit/data/papers/7.pdf',
 '/data/tommaso/llm4scilit/data/papers/1.pdf',
 '/data/tommaso/llm4scilit/data/papers/6.pdf',
 '/data/tommaso/llm4scilit/data/papers/10.pdf',
 '/data/tommaso/llm4scilit/data/papers/5.pdf',
 '/data/tommaso/llm4scilit/data/papers/4.pdf',
 '/data/tommaso/llm4scilit/data/papers/9.pdf',
 '/data/tommaso/llm4scilit/data/papers/8.pdf']

In [9]:
from langchain.document_loaders.parsers import GrobidParser
from langchain.document_loaders.generic import GenericLoader

loader = GenericLoader.from_filesystem(
    DATA_PATH / "papers/",
    glob="2.pdf",
    suffixes=[".pdf"],
    parser=GrobidParser(segment_sentences=False),
)
docs = loader.load()
docs

[Document(page_content='We determined that 144 proteins showed significant differential abundance between the IA and control SF proteomes, of which 11 protein candidates were selected for future follow-up studies.Similar analyses applied to our peptidomic data identified 15 peptide sequences, originating from 4 protein precursors, to have significant differential abundance in IA compared to the control SF peptidome.Pathway enrichment analysis of the IA SF peptidome along with AMP prediction suggests a possible mechanistic role of microbes in eliciting an immune response which drives the development of IA.', metadata={'text': 'We determined that 144 proteins showed significant differential abundance between the IA and control SF proteomes, of which 11 protein candidates were selected for future follow-up studies.Similar analyses applied to our peptidomic data identified 15 peptide sequences, originating from 4 protein precursors, to have significant differential abundance in IA compared

In [10]:
import spacy
# spacy.require_gpu(gpu_id=1)

import spacy_transformers # needed by SpacyTextSplitter when using the en_core_web_trf pipeline
from langchain.text_splitter import SpacyTextSplitter
from itertools import chain

splitter = SpacyTextSplitter(chunk_size=1000, pipeline="en_core_web_trf")
chunks = splitter.split_documents(docs)
chunks[:5]



[Document(page_content='We determined that 144 proteins showed significant differential abundance between the IA and control SF proteomes, of which 11 protein candidates were selected for future follow-up studies.\n\nSimilar analyses applied to our peptidomic data identified 15 peptide sequences, originating from 4 protein precursors, to have significant differential abundance in IA compared to the control SF peptidome.\n\nPathway enrichment analysis of the IA SF peptidome along with AMP prediction suggests a possible mechanistic role of microbes in eliciting an immune response which drives the development of IA.', metadata={'text': 'We determined that 144 proteins showed significant differential abundance between the IA and control SF proteomes, of which 11 protein candidates were selected for future follow-up studies.Similar analyses applied to our peptidomic data identified 15 peptide sequences, originating from 4 protein precursors, to have significant differential abundance in IA 

In [11]:
db_paper_2 = FAISS.from_documents(chunks, model)

In [12]:
db.merge_from(db_paper_2)

In [13]:
db.as_retriever().get_relevant_documents("What are the main serological markers for RA?", metadata={"paper_title": "LC-MS/MS-Based Serum Protein Profiling for Identification of Candidate Biomarkers in Pakistani Rheumatoid Arthritis Patients"})

[Document(page_content='These serum proteins have strong potential to serve as diagnostic and prognostic biomarkers of RA and can also be evaluated to fill the gaps in the current knowledge of pathogenesis of RA.These\n\nfindings can be validated in larger cohorts from different populations to identify diagnostic and prognostic biomarkers of RA.', metadata={'text': 'RA is a complex disease that is influenced by an intricate interactome of various environmental, genetic and microbial factors that influence the immune homeostasis.Owing to the complex genetic architecture accompanied by a plethora of microbial and environmental triggers that an organism is exposed to this has made the identification of diagnostic and prognostic markers challenging.Our study has explored the serum proteomics of this complex autoimmune disorder in a relatively understudied Pakistani population to identify disease biomarkers that are DE among various serotypes of RA patients and healthy controls.We identifie

In [16]:
results = db.as_retriever().get_relevant_documents("What are the main serological markers for RA?", search_kwargs={"metadata": {"paper_title": "Elucidating the endogenous synovial fluid proteome and peptidome of inflammatory arthritis using label-free mass spectrometry"}})

In [18]:
results[0].metadata["paper_title"]

'LC-MS/MS-Based Serum Protein Profiling for Identification of Candidate Biomarkers in Pakistani Rheumatoid Arthritis Patients'

In [27]:
db.index.ntotal

134

In [19]:
chunks[0].metadata

{'text': 'We determined that 144 proteins showed significant differential abundance between the IA and control SF proteomes, of which 11 protein candidates were selected for future follow-up studies.Similar analyses applied to our peptidomic data identified 15 peptide sequences, originating from 4 protein precursors, to have significant differential abundance in IA compared to the control SF peptidome.Pathway enrichment analysis of the IA SF peptidome along with AMP prediction suggests a possible mechanistic role of microbes in eliciting an immune response which drives the development of IA.',
 'para': '2',
 'bboxes': "[[{'page': '1', 'x': '101.12', 'y': '422.98', 'h': '424.81', 'w': '9.24'}, {'page': '1', 'x': '63.12', 'y': '434.98', 'h': '340.13', 'w': '9.24'}], [{'page': '1', 'x': '405.45', 'y': '434.98', 'h': '120.66', 'w': '9.24'}, {'page': '1', 'x': '63.12', 'y': '446.98', 'h': '468.92', 'w': '9.24'}, {'page': '1', 'x': '63.12', 'y': '458.98', 'h': '225.40', 'w': '9.24'}], [{'pag

In [30]:
[x.metadata["paper_title"] for x in db.as_retriever(search_kwargs={"filter": {"paper_title": "Elucidating the endogenous synovial fluid proteome and peptidome of inflammatory arthritis using label-free mass spectrometry"}}).get_relevant_documents("What are the main serological markers for RA?")]

['Elucidating the endogenous synovial fluid proteome and peptidome of inflammatory arthritis using label-free mass spectrometry',
 'Elucidating the endogenous synovial fluid proteome and peptidome of inflammatory arthritis using label-free mass spectrometry',
 'Elucidating the endogenous synovial fluid proteome and peptidome of inflammatory arthritis using label-free mass spectrometry']

In [32]:
import shlex

shlex.split('ask_paper "usu sus" asd')

['ask_paper', 'usu sus', 'asd']