In [17]:
from pathlib import Path
DATA_PATH = Path("/data/tommaso/data")

In [8]:
from langchain.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs

loader = UnstructuredFileLoader(
    DATA_PATH / "papers_processed" / "1.txt",
    strategy="hi_res",
    mode="elements",
    post_processors=[
        clean_extra_whitespace,
        group_broken_paragraphs,
    ])
docs = loader.load()
docs[:4]

[Document(page_content='LC-MS/MS-Based Serum Protein Profiling for Identiﬁcation of Candidate Biomarkers in Pakistani Rheumatoid Arthritis Patients', metadata={'source': PosixPath('/data/tommaso/data/papers_processed/1.txt'), 'filename': '1.txt', 'file_directory': '/data/tommaso/data/papers_processed', 'filetype': 'text/plain', 'category': 'UncategorizedText'}),
 Document(page_content='Abstract: Rheumatoid arthritis is an autoimmune disorder of complex disease etiology. Currently available serological diagnostic markers lack in terms of sensitivity and speciﬁcity and thus addi- tional biomarkers are warranted for early disease diagnosis and management. We aimed to screen and compare serum proteome proﬁles of rheumatoid arthritis serotypes with healthy controls in the Pakistani population for identiﬁcation of potential disease biomarkers. Serum samples from rheumatoid arthritis patients and healthy controls were enriched for low abundance proteins using ProteoMinerTM columns. Rheumatoid

In [18]:
from langchain.document_loaders.parsers import GrobidParser
from langchain.document_loaders.generic import GenericLoader

loader = GenericLoader.from_filesystem(
    DATA_PATH / "papers",
    glob="1.pdf",
    suffixes=[".pdf"],
    parser=GrobidParser(segment_sentences=False),
)
docs = loader.load()
docs

[]

In [19]:
import spacy
spacy.require_gpu(gpu_id=1)

import spacy_transformers # needed by SpacyTextSplitter when using the en_core_web_trf pipeline
from langchain.text_splitter import SpacyTextSplitter
from itertools import chain

splitter = SpacyTextSplitter(chunk_size=1000, pipeline="en_core_web_trf")
chunks = splitter.split_documents(docs)
chunks[:5]

[]

## BioBERT

In [23]:
docs[1].page_content

'Rheumatoid factor (RF) and anti-citrullinated peptide antibodies (ACPA) are considered as the main serological markers for RA that have been included in the 2010 American College of Rheumatology (ACR)/European League against Rheumatism (EULAR) classification criteria for RA [7][8][9].Based on 2010 ACR/EULAR classification criteria for RA, clinically diagnosed RA patients can be categorized into four serotypes: (i) positive for both RF and ACPA, (ii) positive for RF and negative for ACPA, (iii) negative for RF and positive for ACPA and (iv) negative for both RF and ACPA.However, the levels of RF are also perturbed in connective tissue diseases [10] and some chronic infectious diseases such as hepatitis B and hepatitis C virus infections [11].RF is thus not a specific diagnostic marker for RA.ACPA is comparatively a more specific biomarker and two-thirds of the individuals ultimately diagnosed with RA were tested positive for ACPAs 6-10 years before diagnosis [12,13].A total of 1-3% of 

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("question-answering", model="dmis-lab/biobert-large-cased-v1.1-squad", device=1, handle_impossible_answer=True, max_seq_len=512)

In [3]:
pipe.model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(58996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), ep

In [56]:
questions = [
    "How did the authors detect protein abundances?",
    "How can RA patients be categorized?"
]
context = "\n".join([x.page_content for x in docs])

for q in questions:
    a = pipe(question=q, context=context, top_k=2)
    print(f'''
Question: {q}
Answer 1 (score: {a[0]["score"]:.3f}): '{a[0]["answer"]}'
Answer 2 (score: {a[1]["score"]:.3f}): '{a[1]["answer"]}'
''')



Question: How did the authors detect protein abundances?
Answer 1 (score: 0.121): 'Mass spectrometry (MS)-based serum proteomics'
Answer 2 (score: 0.114): 'ProgenesisQITM followed by pathway analysis'


Question: How can RA patients be categorized?
Answer 1 (score: 0.377): 'four serotypes'
Answer 2 (score: 0.320): 'into four serotypes'



In [9]:
context = "\n".join([x.page_content for x in docs])
pipe(question="How did the authors detect protein abundances?", context=context)

{'score': 0.12108789384365082,
 'start': 4854,
 'end': 4899,
 'answer': 'Mass spectrometry (MS)-based serum proteomics'}

## BioGPT

In [22]:
from langchain import HuggingFaceHub, HuggingFacePipeline

HUGGINGFACE_TOKEN = "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"

# llm = HuggingFacePipeline.from_model_id(
#     model_id="stanford-crfm/BioMedLM",
#     task="text-generation",
#     device=1,
#     model_kwargs={"temperature": 0},
# )

from langchain import PromptTemplate, LLMChain

template = """You are a useful and reliableQuestion: {question}
Context: {context}"""
prompt = PromptTemplate(template=template, input_variables=["question", "context"])
llm = HuggingFaceHub(
    repo_id="microsoft/BioGPT-Large-PubMedQA",
    model_kwargs={"temperature": 0.1, "max_length":200},
    huggingfacehub_api_token=HUGGINGFACE_TOKEN
)
llm_chain = LLMChain(prompt=prompt, llm=llm)
question = "How did the authors detect protein abundances?"
context = "\n".join([x.page_content for x in chunks])

# print(llm_chain.run(question=question, context=context))

In [44]:
docs

[Document(page_content='Rheumatoid arthritis (RA) is an autoimmune disorder of complex disease etiology.RA leads to the inflammation of joints and surrounding synovial membrane [1].The global prevalence rate of RA is 0.24% and RA has been ranked as the 42nd highest contributor to global disability [2].Diagnosing RA is a highly individualized process and is based on a combination of both clinical manifestations and serological assays.Early disease diagnosis is the key to prevent joint damage and permanent physical disability in RA [3].RA is considered to be a continuum that begins with a disease-susceptibility stage characterized by a combination of genetic risk factors.This stage proceeds through a pre-clinical stage before the development of early RA characterized by articular inflammation.Environmental and microbial triggers continuously operate across this continuum.Immune-mediated etiology associated with stromal tissue dysregulation contributes to the chronic inflammation and ulti

In [20]:
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

HUGGINGFACE_TOKEN = "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"

llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs={"temperature": 0.1, "max_new_tokens": 80},
    huggingfacehub_api_token=HUGGINGFACE_TOKEN
)
question = "How did the authors detect protein abundances?"

chain_types = ["map_reduce", "refine", "map_rerank"]

chain = load_qa_chain(llm, chain_type="stuff")
print(f"""Type: stuff. {chain({"input_documents": docs[1:3], "question": question}, return_only_outputs=True)["output_text"]}""")

for t in chain_types:
    chain = load_qa_chain(llm, chain_type="stuff")
    # chain.llm_chain.prompt.template = """question: {question}. context: {context}. answer: dummy answer."""
    print(f"""Type: {t}. {chain({"input_documents": docs[1:2], "question": question}, return_only_outputs=True)["output_text"]}""")

Type: stuff. 
The authors detected protein abundances by using a technique called quantitative proteomics, which involves the use of mass spectrometry to measure the amount of protein in a sample. The authors then compared the protein abundances in the samples to determine which proteins were most abundant and which ones were present at lower levels.
Type: map_reduce. 
The authors detected protein abundances by using a technique called quantitative proteomics, which involves the use of mass spectrometry to measure the amount of protein in a sample. The authors then compared the protein abundances in the samples to determine which proteins were most abundant and which ones were present at lower levels.
Type: refine. 
The authors detected protein abundances by using a technique called quantitative proteomics, which involves the use of mass spectrometry to measure the amount of protein in a sample. The authors then compared the protein abundances in the samples to determine which proteins

In [22]:
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

HUGGINGFACE_TOKEN = "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"

llm = HuggingFaceHub(
    repo_id="yhyhy3/med-orca-instruct-33b",
    model_kwargs={"temperature": 0.1, "max_new_tokens": 80},
    huggingfacehub_api_token=HUGGINGFACE_TOKEN
)
question = "How did the authors detect protein abundances?"

chain_types = ["map_reduce", "refine", "map_rerank"]

chain = load_qa_chain(llm, chain_type="stuff")
print(f"""Type: stuff. {chain({"input_documents": docs[1:3], "question": question}, return_only_outputs=True)["output_text"]}""")

for t in chain_types:
    chain = load_qa_chain(llm, chain_type="stuff")
    # chain.llm_chain.prompt.template = """question: {question}. context: {context}. answer: dummy answer."""
    print(f"""Type: {t}. {chain({"input_documents": docs[1:2], "question": question}, return_only_outputs=True)["output_text"]}""")

ValueError: Error raised by inference API: Model yhyhy3/med-orca-instruct-33b time out

: 

In [1]:
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

HUGGINGFACE_TOKEN = "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"

llm = HuggingFaceHub(
    # repo_id="tiiuae/falcon-7b-instruct",
    repo_id="yhyhy3/open_llama_7b_v2_med_instruct",
    model_kwargs={"temperature": 0.1, "max_new_tokens": 80},
    huggingfacehub_api_token=HUGGINGFACE_TOKEN
)
question = "How did the authors detect protein abundances?"

chain_types = ["map_reduce", "refine", "map_rerank"]

chain = load_qa_chain(llm, chain_type="stuff")

In [5]:
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [16]:
chain.run()

ValueError: `run` supported with either positional arguments or keyword arguments, but none were provided.

In [14]:
from langchain import PromptTemplate

template = """{context}\n{question} """

prompt_template = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

load_qa_chain(llm, chain_type="stuff", prompt=prompt_template).llm_chain.prompt.template

'{context}\n{question} '