Extracting Nested Information using GPT and Gemini API

aravindmc · November 1, 2024, 8:38am

In my previous articles, I explained how to extract structured information from VAERS using GPT4 and Gemini APIs.

However, the explanation field was not easy to use for further processing, because you could not be sure whether a given sentence came from the original writeup or was written by the model - i.e. what is the source of the information?

Thankfully both GPT4 and Gemini APIs support the extraction of nested information, which makes it easier to solve this problem.

This is how we will modify the explanation field for GPT4


import json
import os
from openai import OpenAI
from pydantic import BaseModel
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment first,
# so the key lookup below can see them.
load_dotenv()
# The key is read from the environment rather than hard-coded in the script.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

class Explanation(BaseModel):
    # Nested object attached to an extracted value so its source can be traced.
    # Sentence from the writeup that supports the value; the system prompt
    # asks the model to leave this empty when no sentence matches.
    matching_sentence_if_any: str
    # The model's own justification for the value it extracted.
    explanation: str


class DeathInfo(BaseModel):
    # Top-level schema for one writeup; passed as `response_format` to the
    # chat completion `parse` call.
    # Most values are paired with an `<name>_explanation` field holding the
    # nested Explanation object for that value.
    caused_by_covid19: bool
    caused_by_covid19_explanation: Explanation
    tested_positive: bool
    tested_positive_explanation: Explanation
    hospitalized: bool
    hospitalized_explanation: Explanation
    # The system prompt asks for -1 when an int value is not available.
    dose_number: int
    dose_number_explanation: Explanation
    # The system prompt asks for 'Unknown' when a string value is not available.
    vaccine_manufacturer: str
    vaccine_manufacturer_explanation: Explanation
    # The two day counts below have no accompanying Explanation field.
    days_from_last_vaccination_to_symptom_onset: int
    days_from_symptom_onset_to_death: int

# Sample writeup used as the input text for the extraction.
symptom_text = '''
    Patient stated he wasn't feeling well on January 25, 2021, wasn't eating and complained of abdominal pain.  Patient noted to have indigestion and was constipated.  Meds provided and labs ordered.  On morning of January 26, 2021, patient became weak, lethargic and hypoxic and was sent to emergency department around 0700 hours on January 26, 2021.  At approximately 1100 hours, emergency physician notified this writer that patient was not going to overcome his illness and would be placed on comfort care.  At approximately 1130 hours, this writer was notified that patient had passed away from multi-organ failure.
'''
# The DeathInfo model is handed over as response_format so the reply is
# requested in that nested structure.
completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system",
             "content": "You are a biomedical expert. If the value is not available, use 'Unknown' for string and -1 for int. If there is no matching sentence, leave the field empty."},
            {"role": "user",
             "content": f'''
             Writeup:
             {symptom_text}
             '''},
        ],
        response_format=DeathInfo,
)
message = completion.choices[0].message
# A refusal comes back without JSON content; stop with a clear error here
# instead of letting json.loads fail on None with an opaque TypeError.
if message.refusal:
    raise RuntimeError(f"Model refused the extraction request: {message.refusal}")
result = message.content
response = json.loads(result)
# Nothing is read back from the file, so plain write mode is sufficient.
with open('gpt_example_response.json', 'w', encoding='utf-8') as f:
    json.dump(response, f, indent=2)

Notice that we have created another class called Explanation, which we use as the type of every explanation field, so each explanation is now a nested object instead of a plain string.

Here is the resulting JSON response, which is exactly what we would expect

{
  "caused_by_covid19": false,
  "caused_by_covid19_explanation": {
    "matching_sentence_if_any": "",
    "explanation": "There is no mention of COVID-19 being related to the cause of death."
  },
  "tested_positive": false,
  "tested_positive_explanation": {
    "matching_sentence_if_any": "",
    "explanation": "The text does not mention the patient testing positive for COVID-19."
  },
  "hospitalized": true,
  "hospitalized_explanation": {
    "matching_sentence_if_any": "On morning of January 26, 2021, patient became weak, lethargic and hypoxic and was sent to emergency department around 0700 hours on January 26, 2021.",
    "explanation": "The patient was sent to the emergency department, indicating hospitalization."
  },
  "dose_number": -1,
  "dose_number_explanation": {
    "matching_sentence_if_any": "",
    "explanation": "There is no information about vaccination provided."
  },
  "vaccine_manufacturer": "Unknown",
  "vaccine_manufacturer_explanation": {
    "matching_sentence_if_any": "",
    "explanation": "There is no information about vaccination provided."
  },
  "days_from_last_vaccination_to_symptom_onset": -1,
  "days_from_symptom_onset_to_death": 1
}

Notice that this approach makes it much easier to match the explanation with the source sentence.

Let us do the same for the Gemini API

import google.generativeai as genai
import os
import json
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment first,
# so the key lookup below can see them.
load_dotenv()

# The key is read from the environment rather than hard-coded in the script.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

import typing_extensions as typing

class Explanation(typing.TypedDict):
    # Nested object attached to an extracted value so its source can be traced.
    # Sentence from the writeup that supports the value; the system
    # instruction asks the model to leave this empty when no sentence matches.
    matching_sentence_if_any: str
    # The model's own justification for the value it extracted.
    explanation: str

class DeathInfo(typing.TypedDict):
    # Top-level schema for one writeup; passed as `response_schema` in the
    # GenerationConfig of the generate_content call.
    # Most values are paired with an `<name>_explanation` field holding the
    # nested Explanation object for that value.
    caused_by_covid19: bool
    caused_by_covid19_explanation: Explanation
    tested_positive: bool
    tested_positive_explanation: Explanation
    hospitalized: bool
    hospitalized_explanation: Explanation
    # The system instruction asks for -1 when an int value is not available.
    dose_number: int
    dose_number_explanation: Explanation
    # The system instruction asks for 'Unknown' when a string value is not available.
    vaccine_manufacturer: str
    vaccine_manufacturer_explanation: Explanation
    # The two day counts below have no accompanying Explanation field.
    days_from_last_vaccination_to_symptom_onset: int
    days_from_symptom_onset_to_death: int


# Instruction that sets the role and the placeholder values for missing data.
system_prompt = "You are a biomedical expert. If the value is not available, use 'Unknown' for string and -1 for int. If there is no matching sentence, leave the field empty."

# Sample writeup sent as the prompt content.
writeup = '''
    Patient stated he wasn't feeling well on January 25, 2021, wasn't eating and complained of abdominal pain.  Patient noted to have indigestion and was constipated.  Meds provided and labs ordered.  On morning of January 26, 2021, patient became weak, lethargic and hypoxic and was sent to emergency department around 0700 hours on January 26, 2021.  At approximately 1100 hours, emergency physician notified this writer that patient was not going to overcome his illness and would be placed on comfort care.  At approximately 1130 hours, this writer was notified that patient had passed away from multi-organ failure.
    '''

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    system_instruction=system_prompt,
)

# Request a JSON reply that follows the nested DeathInfo schema.
json_config = genai.GenerationConfig(
    response_mime_type="application/json",
    response_schema=DeathInfo,
)
result = model.generate_content(writeup, generation_config=json_config)

# Show the raw response object before decoding its text as JSON.
print(result)
response = json.loads(result.text)
with open('gemini_response.json', 'w+') as out_file:
    json.dump(response, out_file, indent=2)

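And here is the response (quite similar to what we got from GPT4, although the two models disagree on days_from_symptom_onset_to_death: GPT4 returned 1 and Gemini returned 2)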

{
  "caused_by_covid19": false,
  "caused_by_covid19_explanation": {
    "explanation": "The text doesn't mention COVID-19 as the cause of death.",
    "matching_sentence_if_any": ""
  },
  "days_from_last_vaccination_to_symptom_onset": -1,
  "days_from_symptom_onset_to_death": 2,
  "dose_number": -1,
  "dose_number_explanation": {
    "explanation": "No information on vaccine doses administered was found in the text.",
    "matching_sentence_if_any": ""
  },
  "hospitalized": true,
  "hospitalized_explanation": {
    "explanation": "Patient became weak, lethargic and hypoxic and was sent to emergency department",
    "matching_sentence_if_any": "Patient became weak, lethargic and hypoxic and was sent to emergency department around 0700 hours on January 26, 2021."
  },
  "tested_positive": false,
  "tested_positive_explanation": {
    "explanation": "No information on positive test results found in provided text.",
    "matching_sentence_if_any": ""
  },
  "vaccine_manufacturer": "Unknown",
  "vaccine_manufacturer_explanation": {
    "explanation": "No information on vaccine manufacturer was found in the text.",
    "matching_sentence_if_any": ""
  }
}

You can use this approach to extract nearly any kind of information you want from your text, and forcing the LLM to provide a supporting sentence means you can even use this information to train your own custom ML models.