import os

from openai import OpenAI

# To provide the key from inside the notebook, uncomment the next line.
# os.environ["OPENAI_API_KEY"] = 'sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

# The key is read from the environment; `client` is only created when the
# variable is set to a non-empty value.
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    print("OPENAI_API_KEY environment variable not found")
else:
    client = OpenAI(api_key=api_key)
    print("OpenAI client is ready")
OpenAI client is ready
# Single model name shared by every chat-completion request in this notebook.
OPENAI_MODEL = "gpt-4o"
We construct an appropriate prompt, with some example facts, then query the model with each claim in the dataset. We ask the model to assess a claim as 'True', 'False', or 'NEE' if there is not enough evidence one way or the other.
def build_prompt(claim):
    """Build the few-shot chat messages that ask the model to assess a claim.

    Args:
        claim: Text of the scientific claim to assess.

    Returns:
        A two-element list of chat messages: a system message describing the
        task and the allowed outputs ('True', 'False', 'NEE'), and a user
        message holding three worked examples followed by ``claim``.
    """
    system_message = {
        "role": "system",
        "content": (
            "I will ask you to assess a scientific claim. "
            "Output only the text 'True' if the claim is true, "
            "'False' if the claim is false, "
            "or 'NEE' if there's not enough evidence."
        ),
    }

    # Every label and value sits on its own line so that example claims,
    # their assessments and the final claim cannot run into each other.
    user_lines = (
        "Example:",
        "",
        "Claim:",
        "0-dimensional biomaterials show inductive properties.",
        "",
        "Assessment:",
        "False",
        "",
        "Claim:",
        "1/2000 in UK have abnormal PrP positivity.",
        "",
        "Assessment:",
        "True",
        "",
        "Claim:",
        "Aspirin inhibits the production of PGE2.",
        "",
        "Assessment:",
        "False",
        "",
        "End of examples. Assess the following claim:",
        "",
        "Claim:",
        f"{claim}",
        "",
        "Assessment:",
        "",
    )

    return [system_message, {"role": "user", "content": "\n".join(user_lines)}]


def assess_claims(claims):
    """Ask the chat model to assess each claim without any supporting context.

    One chat-completion request is issued per claim through the module-level
    ``client``, using ``OPENAI_MODEL`` and a 3-token output limit.

    Args:
        claims: Iterable of claim strings.

    Returns:
        A list with one response string per claim, in input order, with
        surrounding whitespace, '.' and ',' removed. A reply that carries no
        text content yields an empty string.
    """
    responses = []
    for claim in claims:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=build_prompt(claim),
            max_tokens=3,
        )
        # `content` may be None; fall back to "" so one reply cannot abort the
        # whole batch.
        content = response.choices[0].message.content or ""
        # Strip surrounding punctuation and any whitespace (including
        # newlines) so the result can be compared against the bare labels.
        responses.append(content.strip("., \t\r\n"))
    return responses
We sample 50 claims from the dataset.
# Draw 50 rows at random from the claims table and keep their claim text.
samples = claim_df.sample(n=50)
claims = samples['claim'].tolist()
We evaluate the ground-truth according to the dataset. From the dataset description, each claim is either supported or contradicted by the evidence, or else there isn't enough evidence either way.
def get_groundtruth(evidence):
    """Map each claim's evidence annotation to a ground-truth label.

    Args:
        evidence: Iterable with one entry per claim. Each entry is a mapping
            whose values are lists of annotation dicts carrying a 'label'
            key; an empty mapping means the claim has no evidence.

    Returns:
        A list of labels in input order: 'NEE' for empty evidence, 'True'
        when the first annotation is labelled 'SUPPORT', otherwise 'False'.
    """
    groundtruth = []
    for e in evidence:
        # No evidence recorded for this claim.
        if not e:
            groundtruth.append('NEE')
            continue

        # In this dataset, all evidence for a given claim is consistent
        # (either SUPPORT or CONTRADICT), so the first annotation of the
        # first entry decides the label.
        first_annotations = next(iter(e.values()))
        label = first_annotations[0]['label']
        groundtruth.append('True' if label == 'SUPPORT' else 'False')
    return groundtruth
From these results we see that the LLM is strongly biased to assess claims as true, even when they are false, and also tends to assess false claims as not having enough evidence. Note that 'not enough evidence' is with respect to the model's assessment of the claim in a vacuum, without additional context.
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# The embedding function is created once and handed to the collection below.
embedding_function = OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"))

# chromadb.Client() is ephemeral by default.
chroma_client = chromadb.Client()

scifact_corpus_collection = chroma_client.create_collection(
    name='scifact_corpus',
    embedding_function=embedding_function,
)
batch_size = 100

# Add the corpus to the collection in consecutive slices of `batch_size` rows.
for start in range(0, len(corpus_df), batch_size):
    batch_df = corpus_df[start:start + batch_size]

    # Chroma takes string IDs.
    batch_ids = batch_df['doc_id'].map(str).tolist()

    # Each document is the title followed by the abstract, whose parts are
    # joined with single spaces.
    abstracts = batch_df['abstract'].map(' '.join)
    batch_documents = (batch_df['title'] + '. ' + abstracts).to_list()

    # The metadata is stored too, although this example does not use it.
    batch_metadatas = [
        {"structured": structured}
        for structured in batch_df['structured'].to_list()
    ]

    scifact_corpus_collection.add(
        ids=batch_ids,
        documents=batch_documents,
        metadatas=batch_metadatas,
    )
def build_prompt_with_context(claim, context):
    """Build the chat messages asking the model to assess a claim from evidence.

    Args:
        claim: Text of the scientific claim to assess.
        context: Iterable of evidence document strings; they are joined with
            single spaces into one evidence passage.

    Returns:
        A two-element list of chat messages: a system message describing the
        task and the allowed outputs ('True', 'False', 'NEE'), and a user
        message holding the evidence, the instructions and ``claim``.
    """
    system_message = {
        'role': 'system',
        'content': (
            "I will ask you to assess whether a particular scientific claim, "
            "based on evidence provided. "
            "Output only the text 'True' if the claim is true, "
            "'False' if the claim is false, "
            "or 'NEE' if there's not enough evidence."
        ),
    }

    # Evidence, instructions and claim are kept on separate lines so they do
    # not run together, and the message starts directly with the text (no
    # stray leading quote character).
    user_lines = (
        "The evidence is the following:",
        "",
        ' '.join(context),
        "",
        (
            "Assess the following claim on the basis of the evidence. "
            "Output only the text 'True' if the claim is true, "
            "'False' if the claim is false, "
            "or 'NEE' if there's not enough evidence. "
            "Do not output any other text."
        ),
        "",
        "Claim:",
        f"{claim}",
        "",
        "Assessment:",
        "",
    )

    return [system_message, {'role': 'user', 'content': "\n".join(user_lines)}]


def assess_claims_with_context(claims, contexts):
    """Ask the chat model to assess each claim against its retrieved evidence.

    Claims are paired with contexts positionally. A claim whose context is
    empty is labelled 'NEE' without calling the model; every other claim
    triggers one chat-completion request through the module-level ``client``,
    using ``OPENAI_MODEL`` and a 3-token output limit.

    Args:
        claims: Iterable of claim strings.
        contexts: Iterable of evidence-document lists, aligned with ``claims``.

    Returns:
        A list with one response string per (claim, context) pair, in input
        order, with surrounding whitespace, '.' and ',' removed. A reply that
        carries no text content yields an empty string.
    """
    responses = []
    for claim, context in zip(claims, contexts):
        # With no evidence there is nothing to assess: answer NEE directly.
        if not context:
            responses.append('NEE')
            continue

        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=build_prompt_with_context(claim=claim, context=context),
            max_tokens=3,
        )
        # `content` may be None; fall back to "" so one reply cannot abort the
        # whole batch.
        content = response.choices[0].message.content or ""
        # Strip surrounding punctuation and any whitespace (including
        # newlines) so the result can be compared against the bare labels.
        responses.append(content.strip("., \t\r\n"))
    return responses
If, after filtering on the threshold, no context documents remain, we bypass the model and simply return that there is not enough evidence.
def filter_query_result(query_result, distance_threshold=0.25):
    """Drop retrieved documents that are farther away than a distance threshold.

    For every query in ``query_result`` the per-query lists under 'ids',
    'documents' and 'distances' are filtered in place, keeping only the
    positions whose distance is less than or equal to ``distance_threshold``.
    Other keys of ``query_result`` are left untouched.

    Args:
        query_result: Mapping with 'ids', 'documents' and 'distances' keys,
            each a list holding one list per query. The three inner lists of
            a query are expected to be aligned position by position.
        distance_threshold: Largest distance a document may have to be kept.

    Returns:
        The same ``query_result`` object, after in-place filtering.
    """
    for ids, docs, distances in zip(
        query_result['ids'],
        query_result['documents'],
        query_result['distances'],
    ):
        # Decide once per position, before any list is modified.
        keep = [distance <= distance_threshold for distance in distances]

        # Slice assignment rewrites each list in place in a single pass, so
        # existing references to the inner lists see the filtered contents.
        for column in (ids, docs, distances):
            column[:] = [item for item, kept in zip(column, keep) if kept]
    return query_result
The model now assesses many fewer claims as True or False when there is not enough evidence present. However, it is also now much more cautious, tending to label most items as not enough evidence, biasing away from certainty. Most claims are now assessed as having not enough evidence, because for a large fraction of them every retrieved document is filtered out by the distance threshold. It's possible to tune the distance threshold to find the optimal operating point, but this can be difficult, and is dataset- and embedding-model-dependent.
# System prompt for abstract generation: the task description followed by one
# worked example (a claim and a full abstract).
_HALLUCINATION_SYSTEM_PROMPT = (
    "I will ask you to write an abstract for a scientific paper which supports or refutes a given claim. "
    "It should be written in scientific language, include a title. "
    "Output only one abstract, then stop. "
    "An Example: "
    "Claim: "
    "A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects. "
    "Abstract: "
    "BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). "
    "Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. "
    "Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. "
    "We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria. "
    "METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. "
    "We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). "
    "We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. "
    "This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. \n"
    "In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. "
    "We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09). "
    "CONCLUSIONS The increased erythrocyte count and microcytosis in children homozygous for alpha(+)-thalassaemia may contribute substantially to their protection against SMA. "
    "A lower concentration of Hb per erythrocyte and a larger population of erythrocytes may be a biologically advantageous strategy against the significant reduction in erythrocyte count that occurs during acute infection with the malaria parasite Plasmodium falciparum. "
    "This haematological profile may reduce the risk of anaemia by other Plasmodium species, as well as other causes of anaemia. "
    "Other host polymorphisms that induce an increased erythrocyte count and microcytosis may confer a similar advantage. "
    "End of example. "
)


def build_hallucination_prompt(claim):
    """Build the chat messages asking the model to write an abstract for a claim.

    Args:
        claim: Text of the scientific claim the generated abstract should
            support or refute.

    Returns:
        A two-element list of chat messages: a system message with the task
        description and one worked example, and a user message carrying
        ``claim``.
    """
    # The user message starts directly with the instruction text (no stray
    # leading quote character).
    user_content = f" Perform the task for the following claim. Claim:{claim} Abstract: "
    return [
        {'role': 'system', 'content': _HALLUCINATION_SYSTEM_PROMPT},
        {'role': 'user', 'content': user_content},
    ]


def hallucinate_evidence(claims):
    """Generate one model-written abstract per claim.

    One chat-completion request is issued per claim through the module-level
    ``client`` using ``OPENAI_MODEL``; no output-length limit is set here.

    Args:
        claims: Iterable of claim strings.

    Returns:
        A list with the message content of the first choice of each reply,
        in input order, returned as-is (no stripping).
    """
    responses = []
    for claim in claims:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=build_hallucination_prompt(claim),
        )
        responses.append(response.choices[0].message.content)
    return responses
Combining HyDE with a simple distance threshold leads to a significant improvement. The model is no longer biased toward assessing claims as True, nor toward there not being enough evidence. It also more often correctly recognizes when there isn't enough evidence.