<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Cancer</journal-id><journal-id journal-id-type="publisher-id">cancer</journal-id><journal-id journal-id-type="index">21</journal-id><journal-title>JMIR Cancer</journal-title><abbrev-journal-title>JMIR Cancer</abbrev-journal-title><issn pub-type="epub">2369-1999</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e82971</article-id><article-id pub-id-type="doi">10.2196/82971</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Models for Breast and Cervical Cancers Communication: Mixed Methods Evaluation Study Assessing Linguistic Quality, Safety, and Accessibility</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Saha</surname><given-names>Agnik</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Churchill</surname><given-names>Victoria</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rodriguez</surname><given-names>Anny D</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kursuncu</surname><given-names>Ugur</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Idris</surname><given-names>Muhammed Y</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Computer Science, Georgia State University</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Health Education and Promotion, College of Health and Human Performance, East Carolina University</institution><addr-line>Greenville</addr-line><addr-line>NC</addr-line><country>United States</country></aff><aff id="aff3"><institution>Morehouse School of Medicine</institution><addr-line>720 Westview Drive, SW</addr-line><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff4"><institution>Institute for Insight at the J Mack Robinson College of Business, Georgia State University</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Schwartz</surname><given-names>Amy</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Balcarras</surname><given-names>Matthew</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Cahill</surname><given-names>Naomi</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chow</surname><given-names>James C L</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Qiu</surname><given-names>Mengyang</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Guo</surname><given-names>Yufei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Muhammed Y Idris, PhD, Morehouse School of Medicine, 720 Westview Drive, SW, Atlanta, GA, 30310-1496, United States, 1 404-756-8962; <email>myidris@msm.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>6</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e82971</elocation-id><history><date date-type="received"><day>25</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>22</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Agnik Saha, Victoria Churchill, Anny D Rodriguez, Ugur Kursuncu, Muhammed Y Idris. Originally published in JMIR Cancer (<ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org">https://cancer.jmir.org</ext-link>), 26.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cancer, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org/">https://cancer.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://cancer.jmir.org/2026/1/e82971"/><abstract><sec><title>Background</title><p>Effective communication about breast and cervical cancers remains a public health challenge, with widespread misinformation and barriers to cancer-related language understanding. Large language models (LLMs) offer potential for scalable health communication, yet trade-offs between quality, safety, and accessibility of general-purpose and medical-domain LLMs remain underexplored.</p></sec><sec><title>Objective</title><p>This study aimed to propose a comprehensive evaluation framework and systematically assess the performance of LLMs in generating breast and cervical cancer information, with a focus on linguistic quality, safety and trustworthiness, and communication accessibility and affectiveness.</p></sec><sec sec-type="methods"><title>Methods</title><p>This mixed methods evaluation study assessed outputs from 5 general-purpose and 3 medical LLMs using real-world breast and cervical cancer&#x2013;related questions curated from publicly available medical datasets. LLM-generated responses were evaluated in a controlled offline setting. Primary outcomes included linguistic quality (fluency, coherence, and accuracy), safety and trustworthiness (toxicity, bias, and harm potential), and communication accessibility and affectiveness (readability, empathy, and clarity). Qualitative ratings were performed by domain experts, while quantitative metrics were compared across models. 
Statistical analyses included Welch ANOVA to detect differences in metric scores, Games-Howell tests for pairwise comparisons, and Hedges <italic>g</italic> to assess effect sizes.</p></sec><sec sec-type="results"><title>Results</title><p>General-purpose LLMs, particularly Llama 3 and Gemma, demonstrated superior linguistic quality and affectiveness but often produced complex outputs that may limit accessibility. In contrast, medical LLMs (eg, MedAlpaca and BioMistral) generated simpler content suitable for broader audiences but scored lower in safety and empathy due to higher levels of hallucination, bias, and toxicity.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>While LLMs show promise for improving digital cancer communication, our findings reveal a trade-off between domain specialization and overall communication quality and safety. Future development of health-focused LLMs should prioritize hybrid modeling strategies to enhance trust, clarity, and clinical relevance in patient-facing tools.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>natural language processing</kwd><kwd>medical informatics</kwd><kwd>health communication</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Cancer remains a leading cause of morbidity and mortality among women in the United States, making it a critical public health issue. Breast cancer is the most commonly diagnosed cancer among women, with an estimated 310,720 new cases and 42,250 deaths projected in 2024 [<xref ref-type="bibr" rid="ref1">1</xref>]. Despite improvements in screening and treatment, disparities in cancer outcomes persist. 
For instance, Black women experience a 40% higher breast cancer mortality rate than White women, despite similar incidence rates, largely due to systemic inequities in screening access, delayed diagnoses, and unequal health care [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. Similarly, there were 13,360 new cases of cervical cancer in the United States in 2025 [<xref ref-type="bibr" rid="ref5">5</xref>], with Black women facing a mortality rate 200% higher than White women and Hispanic women experiencing a 51% higher incidence rate [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. These disparities are rooted in structural barriers, including financial hardship, limited geographic access, and psychological challenges [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Early cancer screening can help reduce disparities, but communicating guidelines to priority populations remains challenging [<xref ref-type="bibr" rid="ref10">10</xref>]. Emerging technologies such as large language models (LLMs) show promise for enhancing equitable, effective health communication about breast and cervical cancers by providing accessible, personalized information. Recent research has examined LLM performance in oncology and clinical contexts, showing high accuracy and completeness in patient care questions [<xref ref-type="bibr" rid="ref11">11</xref>], improved readability of cancer information with targeted prompting [<xref ref-type="bibr" rid="ref12">12</xref>], variable results for multimodal chatbot case analysis [<xref ref-type="bibr" rid="ref13">13</xref>], limited gains in diagnostic reasoning in randomized controlled trials [<xref ref-type="bibr" rid="ref14">14</xref>], and calls for careful evaluation of their use in medical research and practice [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. 
However, the rapid development of these models has outpaced systematic research on their real-world effectiveness, safety, and equity. Experts emphasize that before deploying such systems in cancer communication, issues of accuracy, safety, and privacy must be rigorously addressed [<xref ref-type="bibr" rid="ref19">19</xref>], as misinformation can delay diagnosis, influence harmful treatment decisions, and erode trust in health institutions [<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>To address the urgent need for effective communication tools in cancer care, this study evaluates the quality and safety of LLM-generated content related to breast and cervical cancers. Our goal is to ensure that artificial intelligence tools do not worsen disparities or cause harm. We developed a patient-centered evaluation framework assessing LLMs across 3 key areas: linguistic quality, safety and trustworthiness, and communication accessibility and affectiveness (&#x201C;affectiveness&#x201D; specifically measures the model&#x2019;s ability to express warmth, empathy, and emotional appropriateness) that have not previously been operationalized together for this domain. Using this framework, we analyzed 8 open-source models, including 5 general-purpose and 3 medical-domain LLMs, in response to real-world breast and cervical cancers&#x2013;related questions, providing a formative, methods-driven benchmark for assessing how well LLMs support accurate, empathetic, and equitable cancer communication. We report comparative performance based on quantitative and qualitative analyses.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>Our approach consists of 4 phases. First, we developed a comprehensive evaluation framework. Second, we curated a domain-specific dataset for breast and cervical cancers. Third, we selected 5 general-purpose and 3 medical LLMs to generate responses for questions in our dataset. 
Finally, we applied our evaluation framework to the generated responses from each model and conducted statistical analyses for the quantitative metrics and expert qualitative ratings (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the study design and evaluation pipeline. The dataset comprised breast and cervical cancers&#x2013;related questions aggregated from 6 publicly available medical sources (MedLFQA, PubMedQA, MedQA-USMLE, MedMCQA, HealthcareMagic, and iCliniq). Each prompt was submitted to 8 large language models (5 general-purpose and 3 medical-domain models), generating model-specific responses. These outputs were evaluated across 3 major dimensions&#x2014;linguistic quality, safety and trustworthiness, and communication accessibility and affectiveness&#x2014;followed by statistical analysis using Levene test, Welch ANOVA, and Games-Howell pairwise comparisons. The figure illustrates the end-to-end flow from dataset construction through model response generation and evaluation. LLMs: large language models; PET-CT: positron emission tomography&#x2013;computed tomography.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e82971_fig01.png"/></fig></sec><sec id="s2-2"><title>Evaluation Framework</title><p>This evaluation framework offers a structured approach to assessing the quality of language generated by LLMs in the context of breast and cervical cancers communication. It is designed to ensure that evaluations are consistent, thorough, and grounded in clearly defined criteria. Given the unique barriers faced by women in underserved communities, such content must be clear, trustworthy, and sensitive to diverse literacy and cultural needs [<xref ref-type="bibr" rid="ref21">21</xref>]. 
Our framework focuses on 3 core dimensions critical to effective patient communication: Linguistic Quality (eg, accuracy, clarity, and flow of language), Safety and Trustworthiness (eg, presence of biased, harmful, or misleading content), and Communication Accessibility and Affectiveness (eg, readability, empathy, and emotional relevance; <xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comprehensive evaluation framework for evaluating general purpose and specialized medical large language models for cancer communication.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e82971_fig02.png"/></fig></sec><sec id="s2-3"><title>Linguistic Quality</title><sec id="s2-3-1"><title>Overview</title><p>Linguistic quality refers to the clarity, accuracy, and relevance of the information generated by LLMs. In this study, we assessed how well model responses reflected reliable and well-structured cancer communication. To evaluate this, we used a combination of automated text similarity tools, designed to measure how closely model outputs matched reference content, and expert ratings of qualitative features. Linguistic evaluation in this study focuses on assessing the accuracy, relevance, and overall quality of the generated text. The primary metrics used for this purpose are ROUGE (Recall-Oriented Understudy for Gisting Evaluation), Bleurt Score, and BERTScore, which collectively encompass precision, recall, and <italic>F</italic><sub>1</sub>-score. ROUGE measures how much of the reference information is captured in the generated text. This widely used metric compares n-grams, longest common subsequences, and word pairs between the generated text and the reference text. Bleurt Score evaluates the quality of the generated text by comparing it with a reference, taking into account both semantic and syntactic aspects. 
It uses a pretrained model to predict a quality score based on various linguistic features. BERTScore, on the other hand, measures the semantic similarity between the generated text and the reference text. It uses contextual embeddings from BERT to compute precision, recall, and <italic>F</italic><sub>1</sub>-scores for each token in the text. We also examined the likelihood of &#x201C;hallucinations,&#x201D; meaning content generated by the model that is factually incorrect or not supported by the source material [<xref ref-type="bibr" rid="ref22">22</xref>].</p></sec><sec id="s2-3-2"><title>Hallucination</title><p>To calculate the hallucination score, we use the methodology described in the paper [<xref ref-type="bibr" rid="ref22">22</xref>]. The process begins by analyzing the generated text, focusing particularly on informative keywords such as named entities and nouns, which are most susceptible to hallucination. The model&#x2019;s uncertainty for each token is quantified through entropy, which measures the unpredictability of the model&#x2019;s predictions. Tokens with higher entropy indicate that the model is less certain about its output, suggesting a greater likelihood of generating hallucinated content. These uncertainty-based losses are then adjusted across multiple thresholds to account for varying confidence levels, normalized to ensure comparability across different models and texts, and finally aggregated to produce a hallucination score. This score, ranging from 0 to 1, is used to quantify the extent to which the generated text deviates from factual information, with higher scores indicating a greater risk of hallucination. 
This metric is crucial for identifying and mitigating the generation of inaccurate or fabricated content, especially in domains where factual accuracy is critical, such as medical text generation.</p><p>Expert reviewers rated each response on 4 communication-related criteria: accuracy (clinical correctness), coherence (logical flow and consistency), use of jargon (degree of unnecessarily technical language), and understanding and reasoning (the model&#x2019;s ability to interpret medical questions and provide appropriate, well-explained answers). Together, these measures reflect how well a model can produce trustworthy, patient-relevant cancer information.</p></sec></sec><sec id="s2-4"><title>Safety and Trustworthiness</title><sec id="s2-4-1"><title>Overview</title><p>Safety and trustworthiness refer to whether the language generated by LLMs is free from harmful, biased, or misleading content&#x2014;factors that are essential for patient trust and effective communication. We evaluated 3 key risks: toxicity (language that is offensive, threatening, or emotionally harmful), gender bias, and racial bias. Toxicity was measured using an established automated tool that detects potentially harmful or inappropriate language [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. While breast cancer primarily affects women, men can also be diagnosed with the disease [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Therefore, it was important to assess whether model responses unintentionally reinforced gender stereotypes or excluded male patients. 
We measured gender bias using a tool that quantifies how strongly language is associated with one gender over another, where higher scores indicate greater imbalance [<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec><sec id="s2-4-2"><title>Gender Bias</title><p>The Genbit score is a metric designed to quantify gender bias in language models and datasets. Specifically, the average bias conditional absolute score is a widely used measure in this context. This score calculates the average of the absolute values of the log ratio of the probabilities of a word occurring, given a male context versus a female context across all words in a dataset. Formally, this is represented as the average over all words.</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">b</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">s</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>w</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>w</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>w</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where &#x1D443; (&#x1D464;|&#x1D45A;) and &#x1D443; (&#x1D464;|&#x1D453;) denote the conditional probabilities of the word &#x1D464;<italic>,</italic> given male and female contexts, respectively. 
The intuition behind this metric is to provide a robust measure of bias by examining the differences in word associations with male and female contexts. By taking the absolute value of the log ratio, the metric ensures that biases favoring either gender are accounted for, without canceling each other out. This approach helps in identifying systemic biases where certain words might be more strongly associated with one gender over the other, which can perpetuate stereotypes and unequal representations in language models. This score thus serves as a key indicator for evaluating and addressing gender bias in natural language processing systems, ensuring more balanced and fair representations in artificial intelligence&#x2013;generated text.</p><p>A higher Genbit score indicates a greater degree of gender bias in the dataset. This means that there are more significant differences in the conditional probabilities of words occurring in male versus female contexts, reflecting stronger gender associations. The range of Genbit score values can vary depending on the dataset and the level of bias present. Generally, lower scores indicate minimal bias, whereas higher scores suggest more pronounced bias.</p><p>To evaluate racial bias, we modified sample prompts to include racial or ethnic (and related) contexts (eg, &#x201C;Black woman&#x201D; and &#x201C;Hispanic patient&#x201D;) and examined whether responses changed inappropriately [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. Concretely, prompts took the form &#x201C;{question}. I am {Male/Female},&#x201D; &#x201C;{question}. I am {Caucasian/African American/Asian/American Indian/Native Hawaiian},&#x201D; &#x201C;{question}. I am {Hispanic/Non-Hispanic},&#x201D; and &#x201C;{question}. 
I am a {Native/Non-native} English speaker.&#x201D; We randomly sampled 50 base questions from the study corpus and, for each question, generated 1 response per model per demographic variant under a fixed instruction and decoding settings. We then computed similarity between the baseline response (no demographic cue) and each contextualized response. Higher similarity indicates smaller demographic-induced changes (interpreted as lower bias), whereas lower similarity flags potential sensitivity to the demographic cue. In addition to this automated measure, expert review examined content for harm and trust or confidence dimensions to capture clinically relevant disparities that may not be reflected by surface-level similarity.</p><p>Beyond these automated measures, we also conducted expert assessments focused on 2 dimensions: harm, referring to content that could be emotionally distressing or medically misleading, and trust and confidence, reflecting how well the tone and framing of the response foster user trust and decision-making support [<xref ref-type="bibr" rid="ref31">31</xref>]. These combined assessments offer a more comprehensive view of how safe and equitable LLM-generated cancer communication may be for diverse patient populations. In addition to explicit bias and toxicity checks, the expert review dimensions of accuracy, harm, and trust also capture broader safety risks such as outdated or omitted clinical details and subtle misinformation, ensuring that the evaluation encompasses both factual correctness and communicative reliability.</p></sec></sec><sec id="s2-5"><title>Communication Accessibility and Affectiveness</title><p>Communication accessibility and affectiveness describe how understandable, emotionally supportive, and actionable the generated content is for patients [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. 
To assess accessibility, we applied a set of widely used readability formulas that estimate how easy or difficult a passage is to read based on sentence structure and vocabulary [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. These metrics helped us determine whether the content was appropriate for a broad audience, including individuals with lower health literacy. To evaluate emotional tone, we used a scoring method that estimates how well the model&#x2019;s responses reflect empathy and emotional alignment with patients, based on patterns in real-world counseling conversations [<xref ref-type="bibr" rid="ref40">40</xref>]. In addition to these automated measures, expert reviewers evaluated several qualitative aspects of the content. Clarity and empathy assessed whether the language was both understandable and compassionate. Compassion specifically reflected emotional sensitivity and supportiveness. Cue to action measured whether the content encouraged patients to take meaningful next steps, such as scheduling a screening. Domain relevance ensured that the responses stayed focused on breast or cervical cancer rather than veering into unrelated information. Finally, usability and acceptability considered how practical and appropriate the content was for patients, particularly in community or clinical health communication settings.</p></sec><sec id="s2-6"><title>Dataset</title><p>We curated a domain-specific dataset for evaluating LLMs on breast and cervical cancers communication by filtering 6 publicly available medical datasets using the keywords &#x201C;breast cancer&#x201D; and &#x201C;cervical cancer.&#x201D; PubMedQA [<xref ref-type="bibr" rid="ref41">41</xref>], comprising biomedical Q&#x0026;A pairs from PubMed abstracts, contributed 3310 filtered instances. 
MedQA-USMLE [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], based on the United States Medical Licensing Examination (USMLE), provided 141 instances, while MedMCQA [<xref ref-type="bibr" rid="ref42">42</xref>], covering Indian medical entrance examinations, contributed 278 cases. From MedLFQA [<xref ref-type="bibr" rid="ref44">44</xref>], which aggregates consumer health queries from sources such as LiveQA [<xref ref-type="bibr" rid="ref45">45</xref>], MedicationQA [<xref ref-type="bibr" rid="ref46">46</xref>], HealthSearchQA [<xref ref-type="bibr" rid="ref41">41</xref>], and K-QA [<xref ref-type="bibr" rid="ref47">47</xref>], we extracted 36 relevant cases. Additionally, HealthcareMagic and iCliniq [<xref ref-type="bibr" rid="ref48">48</xref>], both user-generated Q&#x0026;A platforms, added 835 and 43 instances, respectively. The final dataset comprised 4643 cases, offering a diverse and clinically relevant foundation to rigorously assess LLMs&#x2019; performance in generating accurate, safe, and patient-centered cancer information.</p></sec><sec id="s2-7"><title>Experimental Setup: Selected LLMs</title><p>We selected both general-purpose and specialized medical LLMs to assess their effectiveness in generating accurate breast and cervical cancers information, aiming to compare general-purpose and specialized medical LLMs for their performance in the 3 main evaluation categories. We deliberately evaluated sub-8B, open-source models to match the practical and methodological goals of this work. First, they are replicable and deployable on a single commodity graphics processing unit (or central processing unit with batching), which allows academic, clinical, and low-resource teams to reproduce results and run models on-premises or at the edge without costly infrastructure. 
Second, open weights enable fine-tuning for breast and cervical cancers communication, safety auditing, and governance (data control, protected health information protection, and versioning); by contrast, closed systems do not generally permit domain-specific fine-tuning or weight-level inspection, limiting adaptation to clinical communication needs. Third, focusing below 8B reduces confounding from pure scale effects, clarifying the contribution of prompting, alignment, and evaluation methods rather than parameter count alone. Finally, sub-8B models align with realistic deployment constraints (latency, memory footprint, and energy cost) in clinics and community settings, making the findings directly actionable for stakeholders who cannot rely on large-hosted models.</p><p>We evaluated 5 general-purpose LLMs: Vicuna 7B, Alpaca 7B [<xref ref-type="bibr" rid="ref49">49</xref>], Llama 3 8B [<xref ref-type="bibr" rid="ref50">50</xref>], Mistral 7B [<xref ref-type="bibr" rid="ref51">51</xref>], and Gemma 7B [<xref ref-type="bibr" rid="ref52">52</xref>], selected for their state-of-the-art performance in generating content [<xref ref-type="bibr" rid="ref53">53</xref>] and training methodologies. We included 3 specialized medical LLMs: MedAlpaca [<xref ref-type="bibr" rid="ref54">54</xref>], BioMistral 7B [<xref ref-type="bibr" rid="ref55">55</xref>], and Meditron [<xref ref-type="bibr" rid="ref56">56</xref>] to assess domain-specific performance, particularly for breast and cervical cancers [<xref ref-type="bibr" rid="ref57">57</xref>].</p><p>For all models except Llama 3, we used the following parameters: a maximum of 512 new tokens, sampling enabled with a top_k value of 50, a top_p value of 0.9, a temperature of 1.0, and the &#x201C;pad_token_id&#x201D; set to the tokenizer&#x2019;s end-of-sequence token. These parameters were chosen for several reasons. The maximum of 512 new tokens ensures that responses are sufficiently detailed without being excessively long. 
Sampling with a top_k value of 50 allows the model to consider a broad range of possible next tokens, promoting diversity in responses while still focusing on the most probable options. The top_p value of 0.9 for nucleus sampling ensures that only the most likely tokens are considered, balancing randomness and coherence in the generated text. A temperature of 1.0 maintains a moderate level of randomness, preventing the responses from being too deterministic or too chaotic. Setting the pad_token_id to the tokenizer&#x2019;s end-of-sequence token ensures proper sequence padding and termination. For Llama 3, we used a different set of parameters to optimize its performance. The maximum number of new tokens was also set to 512. However, sampling was disabled (do_sample set to false), making the predictions deterministic and ensuring consistent outputs for the same input. The temperature was set to 0.0 to further enforce deterministic behavior, and the top_p value remained at 0.9 for nucleus sampling.</p></sec><sec id="s2-8"><title>Data Analysis: Statistical Analysis of Quantitative Metrics</title><p>We applied Welch ANOVA to each evaluation metric to test whether there were statistically significant differences in performance across the 8 LLMs; this test is suitable for datasets with unequal variances and sample sizes, conditions consistent with our experimental setting [<xref ref-type="bibr" rid="ref58">58</xref>]. For metrics that showed significance, Games-Howell post hoc tests were used to perform pairwise comparisons between every unique pair of LLMs without assuming homogeneity of variance or equal sample sizes for this multimodel and multimetric comparison [<xref ref-type="bibr" rid="ref59">59</xref>]. For each LLM pair, we computed Hedges <italic>g</italic> to quantify the effect size and direction of difference [<xref ref-type="bibr" rid="ref60">60</xref>]. 
Rankings were adjusted accordingly: if the effect size was positive (indicating better performance), the first model&#x2019;s rank increased and the second model&#x2019;s rank decreased, and vice versa. Statistical significance was set at <italic>P</italic>&#x003C;.05, with both <italic>P</italic> values and effect sizes used to assess statistical and practical significance jointly.</p></sec><sec id="s2-9"><title>Coding of Qualitative Data and Evaluation</title><p>Two domain experts in health communication and breast or cervical cancer (VC and ADR) independently evaluated model outputs using a structured rubric aligned with the 3 core evaluation categories. We assembled an 8&#x00D7;50 qualitative dataset (400 responses) via stratified random sampling from the 4643-item corpus: sampling was proportional to source pools (PubMedQA, MedQA-USMLE, MedMCQA, MedLFQA, HealthcareMagic, and iCliniq) and further stratified by cancer type (breast and cervical), topic (screening or eligibility, diagnosis or prognosis, and treatment options or risks and survivorship), and question style (factual, procedural, and counseling). Within each stratum, items were selected at random, with minimum guaranteed counts for rare but clinically salient strata to ensure coverage.</p><p>Raters were blinded to model identity, item order was randomized, and scoring began with a calibration block using rubric anchors, with scheduled breaks to mitigate order and fatigue effects. Responses were rated on multiple qualitative criteria (eg, accuracy, harm, empathy, trust, clarity, and actionability) on a 3-point Likert scale. 
Scores from each expert were averaged per criterion and treated as interval data, consistent with common practice in psychometrics or health communication research [<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>].</p><p>To enhance transparency, a public companion website hosts the codebook questions, per-model responses, stratum labels, and the rating interface used in the study, allowing readers to inspect exactly what was evaluated. The 50-prompt target per model was set a priori for precise estimation of model means; pilot resampling indicated saturation of rank orderings and false discovery rate&#x2013;controlled pairwise conclusions beyond approximately 40&#x2010;50 items, yielding a qualitative view that is feasible, reliable, and representative of the full corpus. Responses were rated on multiple qualitative criteria in each category (eg, accuracy, harm, empathy, trust, clarity, and actionability) using a 3-point Likert scale. Scores from each expert were averaged for each criterion item (eg, average score for accuracy and average score for empathy) and treated as interval data, consistent with standard practices in psychometrics and health communication research [<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. This approach was selected for aligning with the study&#x2019;s focus on category-level evaluation, reducing item-level variability, and emphasizing consistent rating patterns across categories. Interrater reliability was assessed using weighted Cohen &#x03BA; (&#x03BA;<sub>w</sub>), with quadratic weights applied to penalize larger disagreements more heavily [<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>]. Descriptive statistics were reported by model and category, providing a rigorous assessment of model performance. 
Additional details, including a sample question from the curated dataset, representative responses from the 8 LLMs, qualitative metric definitions, and supplementary metric tables, are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-10"><title>Ethical Considerations</title><p>This study did not involve human participants, patient data, or any personally identifiable information. Therefore, ethical approval and informed consent were not required in accordance with institutional and national guidelines. All analyses were conducted on publicly available data and LLM outputs following JMIR&#x2019;s ethical research standards.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p><xref ref-type="table" rid="table1">Table 1</xref> presents a summary on the performance of 8 LLMs, 5 general-purpose models, and 3 medical-domain models, across 3 key evaluation categories: Linguistic Quality, Safety and Trustworthiness, and Communication Accessibility and Affectiveness. 
Each cell in the table reports the model&#x2019;s rank (1=best) and corresponding actual relative score within parentheses used to compute the rank.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Table ranking 8 large language models across 3 dimensions: Linguistic Quality, Trustworthiness, and Accessibility<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimensions and metrics</td><td align="left" valign="bottom" colspan="5">General-purpose LLMs<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="3">Medical LLMs</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Llama3</td><td align="left" valign="bottom">Gemma</td><td align="left" valign="bottom">Alpaca</td><td align="left" valign="bottom">Mistral</td><td align="left" valign="bottom">Vicuna</td><td align="left" valign="bottom">MedAlpaca</td><td align="left" valign="bottom">BioMistral</td><td align="left" valign="bottom">Meditron</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Linguistic Quality</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEURT Score</td><td align="left" valign="top">1 (7)</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">5 (&#x2212;2)</td><td align="left" valign="top">5 (&#x2212;2)</td><td align="left" valign="top">3 (3)</td><td align="left" valign="top">7 (&#x2212;5)</td><td align="left" valign="top">8 (&#x2212;7)</td><td align="left" valign="top">4 (1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BertScore Precision</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">3 (3)</td><td align="left" valign="top">7 (&#x2212;6)</td><td 
align="left" valign="top">5 (&#x2212;2)</td><td align="left" valign="top">4 (1)</td><td align="left" valign="top">5 (&#x2212;2)</td><td align="left" valign="top">1 (7)</td><td align="left" valign="top">7 (&#x2212;6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BertScore Recall</td><td align="left" valign="top">1 (7)</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">6 (&#x2212;4)</td><td align="left" valign="top">3 (1)</td><td align="left" valign="top">3 (1)</td><td align="left" valign="top">6 (&#x2212;4)</td><td align="left" valign="top">8 (&#x2212;7)</td><td align="left" valign="top">3 (1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BertScore F1</td><td align="left" valign="top">1 (7)</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">8 (&#x2212;7)</td><td align="left" valign="top">5 (&#x2212;1)</td><td align="left" valign="top">3 (2)</td><td align="left" valign="top">6 (&#x2212;4)</td><td align="left" valign="top">3 (2)</td><td align="left" valign="top">6 (&#x2212;4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Rouge-1</td><td align="left" valign="top">1 (6)</td><td align="left" valign="top">1 (6)</td><td align="left" valign="top">6 (&#x2212;5)</td><td align="left" valign="top">4 (1)</td><td align="left" valign="top">3 (3)</td><td align="left" valign="top">6 (&#x2212;5)</td><td align="left" valign="top">6 (&#x2212;5)</td><td align="left" valign="top">5 (&#x2212;1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Rouge-2</td><td align="left" valign="top">1 (7)</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">5 (&#x2212;4)</td><td align="left" valign="top">4 (1)</td><td 
align="left" valign="top">3 (3)</td><td align="left" valign="top">5 (&#x2212;4)</td><td align="left" valign="top">5 (&#x2212;4)</td><td align="left" valign="top">5 (&#x2212;4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Rouge-L</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">1 (7)</td><td align="left" valign="top">6 (&#x2212;5)</td><td align="left" valign="top">4 (1)</td><td align="left" valign="top">3 (3)</td><td align="left" valign="top">6 (&#x2212;5)</td><td align="left" valign="top">6 (&#x2212;5)</td><td align="left" valign="top">5 (&#x2212;1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hallucination Score</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">7 (6)</td><td align="left" valign="top">7 (6)</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">6 (3)</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">4 (0)</td></tr><tr><td align="left" valign="top" colspan="9">Safety and Trustworthiness</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gender Bias</td><td align="left" valign="top">7 (5)</td><td align="left" valign="top">8 (7)</td><td align="left" valign="top">2 (&#x2212;5)</td><td align="left" valign="top">6 (3)</td><td align="left" valign="top">5 (1)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">4 (&#x2212;1)</td><td align="left" valign="top">3 (&#x2212;3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Toxicity Score</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">8 (7)</td><td align="left" valign="top">1 
(&#x2212;7)</td><td align="left" valign="top">6 (3)</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">7 (5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Severe Toxicity</td><td align="left" valign="top">1 (&#x2212;4)</td><td align="left" valign="top">7 (6)</td><td align="left" valign="top">1 (&#x2212;4)</td><td align="left" valign="top">5 (2)</td><td align="left" valign="top">1 (&#x2212;4)</td><td align="left" valign="top">1 (&#x2212;4)</td><td align="left" valign="top">5 (2)</td><td align="left" valign="top">7 (6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Identity Attack</td><td align="left" valign="top">3 (&#x2212;2)</td><td align="left" valign="top">7 (6)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">6 (3)</td><td align="left" valign="top">4 (&#x2212;1)</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">4 (&#x2212;1)</td><td align="left" valign="top">7 (6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Insult</td><td align="left" valign="top">6 (5)</td><td align="left" valign="top">6 (5)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">5 (1)</td><td align="left" valign="top">3 (&#x2212;2)</td><td align="left" valign="top">2 (&#x2212;5)</td><td align="left" valign="top">3 (&#x2212;2)</td><td align="left" valign="top">6 (5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Profanity</td><td align="left" valign="top">1 (&#x2212;6)</td><td align="left" valign="top">7 (7)</td><td align="left" valign="top">2 (&#x2212;4)</td><td 
align="left" valign="top">5 (2)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">5 (2)</td><td align="left" valign="top">7 (7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Threat</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">5 (3)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">5 (3)</td><td align="left" valign="top">4 (&#x2212;2)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">5 (3)</td><td align="left" valign="top">5 (3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sexually Explicit</td><td align="left" valign="top">1 (&#x2212;6)</td><td align="left" valign="top">7 (6)</td><td align="left" valign="top">2 (&#x2212;3)</td><td align="left" valign="top">6 (0)</td><td align="left" valign="top">3 (&#x2212;1)</td><td align="left" valign="top">3 (&#x2212;1)</td><td align="left" valign="top">3 (&#x2212;1)</td><td align="left" valign="top">7 (6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Flirtation</td><td align="left" valign="top">2 (&#x2212;5)</td><td align="left" valign="top">4 (&#x2212;1)</td><td align="left" valign="top">8 (7)</td><td align="left" valign="top">7 (3)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">6 (1)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">4 (&#x2212;1)</td></tr><tr><td align="left" valign="top" colspan="9">Communication Accessibility and Affectiveness</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Flesch Reading Ease</td><td 
align="left" valign="top">8 (&#x2212;6)</td><td align="left" valign="top">6 (&#x2212;4)</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">4 (1)</td><td align="left" valign="top">5 (&#x2212;2)</td><td align="left" valign="top">3 (4)</td><td align="left" valign="top">1 (6)</td><td align="left" valign="top">6 (&#x2212;4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Flesch-Kincaid Grade Level</td><td align="left" valign="top">8 (6)</td><td align="left" valign="top">6 (4)</td><td align="left" valign="top">1 (&#x2212;5)</td><td align="left" valign="top">4 (&#x2212;1)</td><td align="left" valign="top">5 (2)</td><td align="left" valign="top">1 (&#x2212;5)</td><td align="left" valign="top">1 (&#x2212;5)</td><td align="left" valign="top">6 (4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gunning Fog Index</td><td align="left" valign="top">8 (7)</td><td align="left" valign="top">6 (3)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">4 (&#x2212;1)</td><td align="left" valign="top">7 (4)</td><td align="left" valign="top">2 (&#x2212;5)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">5 (2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Smog Index</td><td align="left" valign="top">8 (7)</td><td align="left" valign="top">7 (5)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">2 (&#x2212;5)</td><td align="left" valign="top">1 (&#x2212;7)</td><td align="left" valign="top">6 (3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Automated Readability Index</td><td 
align="left" valign="top">8 (4)</td><td align="left" valign="top">5 (2)</td><td align="left" valign="top">2 (&#x2212;4)</td><td align="left" valign="top">3 (&#x2212;3)</td><td align="left" valign="top">6 (3)</td><td align="left" valign="top">1 (&#x2212;5)</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">6 (3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Coleman Liau Index</td><td align="left" valign="top">5 (3)</td><td align="left" valign="top">5 (3)</td><td align="left" valign="top">1 (&#x2212;5)</td><td align="left" valign="top">3 (&#x2212;2)</td><td align="left" valign="top">5 (3)</td><td align="left" valign="top">1 (&#x2212;5)</td><td align="left" valign="top">4 (0)</td><td align="left" valign="top">5 (3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reflection Score</td><td align="left" valign="top">3 (3)</td><td align="left" valign="top">2 (4)</td><td align="left" valign="top">7 (&#x2212;3)</td><td align="left" valign="top">1 (6)</td><td align="left" valign="top">5 (0)</td><td align="left" valign="top">3 (3)</td><td align="left" valign="top">8 (&#x2212;4)</td><td align="left" valign="top">5 (0)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>For Linguistic Quality, higher values are better for BERTScore (Precision, Recall, and F1), BLEURT Score, and ROUGE (1, 2, and L), while lower values are better for Hallucination score. For Trustworthiness metrics (eg, Toxicity), lower values are better. In Accessibility, higher values are better for Flesch Reading Ease and Reflection score, while lower values are better for Flesch-Kincaid Grade Level, Coleman-Liau Index, and Gunning Fog Index. Each cell shows the rank (score). 
</p></fn><fn id="table1fn2"><p><sup>b</sup>LLMs: large language models.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Performance of LLMs in Linguistic Quality</title><sec id="s3-2-1"><title>Quantitative Evaluation</title><p>Our analysis revealed significant differences in BLEURT, BERTScore, and ROUGE across models, indicating distinct strengths and weaknesses in linguistic fluency and content quality. As shown in <xref ref-type="table" rid="table1">Table 1</xref>, post hoc analysis identified general LLMs, specifically Llama 3, as outperforming medical LLMs, based on BLEURT (0.41), BERTScore Recall (0.86), and ROUGE-1 (0.51), indicating higher linguistic quality, fluency, and relevance. Among medical LLMs, BioMistral demonstrated higher precision (BERTScore Precision and F1: 0.82), highlighting its capability for accurate, domain-specific content generation. However, general LLMs, including Alpaca and Mistral, showed elevated hallucination scores (both at 0.57), suggesting a trade-off between fluency and factuality. Llama 3 had the lowest hallucination score among general LLMs, demonstrating its robustness in factual accuracy.</p></sec><sec id="s3-2-2"><title>Evaluation of Qualitative Content</title><p>Assessments of the qualitative content (<xref ref-type="table" rid="table2">Table 2</xref>) revealed moderate to near-perfect interrater agreement, especially for coherence (&#x03BA;<sub>w</sub>=0.82) and accuracy (&#x03BA;<sub>w</sub>=0.60). Llama 3 scored the highest across all linguistic criteria items, particularly in reasoning (2.94) and accuracy (2.92), reflecting strong factual consistency and logical structure. In contrast, MedAlpaca and Meditron scored lowest, with Meditron exhibiting poor coherence (1.13) and excessive jargon (1.57), suggesting limitations in clarity and accessibility. Alpaca and Mistral performed moderately but lagged in reasoning and accuracy. 
These findings indicate that general-purpose models, particularly Llama 3 and Gemma, outperformed specialized medical LLMs in generating clear and accurate cancer-related communication content.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Qualitative evaluation of general-purpose and medical large language models across linguistic quality, safety or trustworthiness, and communication or accessibility dimensions<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metrics</td><td align="left" valign="bottom" colspan="5">General-purpose LLMs<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="3">Medical LLMs</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Llama3</td><td align="left" valign="bottom">Gemma</td><td align="left" valign="bottom">Alpaca</td><td align="left" valign="bottom">Mistral</td><td align="left" valign="bottom">Vicuna</td><td align="left" valign="bottom">MedAlpaca</td><td align="left" valign="bottom">BioMistral</td><td align="left" valign="bottom">Meditron</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Linguistic Quality</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">2.92</td><td align="left" valign="top">2.70</td><td align="left" valign="top">1.53</td><td align="left" valign="top">1.48</td><td align="left" valign="top">2.11</td><td align="left" valign="top">1.48</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.35</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Coherence</td><td align="left" valign="top">2.81</td><td align="left" valign="top">2.66</td><td align="left" 
valign="top">1.36</td><td align="left" valign="top">1.57</td><td align="left" valign="top">1.94</td><td align="left" valign="top">1.43</td><td align="left" valign="top">1.12</td><td align="left" valign="top">1.13</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Jargon</td><td align="left" valign="top">2.11</td><td align="left" valign="top">1.96</td><td align="left" valign="top">1.98</td><td align="left" valign="top">1.75</td><td align="left" valign="top">1.92</td><td align="left" valign="top">1.74</td><td align="left" valign="top">1.49</td><td align="left" valign="top">1.57</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Understanding</td><td align="left" valign="top">2.94</td><td align="left" valign="top">2.67</td><td align="left" valign="top">1.60</td><td align="left" valign="top">1.66</td><td align="left" valign="top">2.04</td><td align="left" valign="top">1.58</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.40</td></tr><tr><td align="left" valign="top" colspan="9">Safety and Trustworthiness</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harm</td><td align="left" valign="top">2.96</td><td align="left" valign="top">2.76</td><td align="left" valign="top">1.62</td><td align="left" valign="top">1.54</td><td align="left" valign="top">2.11</td><td align="left" valign="top">1.51</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.42</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Trust and Confidence</td><td align="left" valign="top">2.93</td><td align="left" valign="top">2.64</td><td align="left" valign="top">1.56</td><td align="left" valign="top">1.64</td><td align="left" valign="top">2.06</td><td 
align="left" valign="top">1.53</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.43</td></tr><tr><td align="left" valign="top" colspan="9">Communication Accessibility and Affectiveness</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clarity &#x0026; Empathy</td><td align="left" valign="top">2.89</td><td align="left" valign="top">2.64</td><td align="left" valign="top">1.70</td><td align="left" valign="top">1.59</td><td align="left" valign="top">2.05</td><td align="left" valign="top">1.59</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.41</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Compassion</td><td align="left" valign="top">2.86</td><td align="left" valign="top">2.21</td><td align="left" valign="top">1.72</td><td align="left" valign="top">1.61</td><td align="left" valign="top">2.00</td><td align="left" valign="top">1.67</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.56</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cue to Action</td><td align="left" valign="top">2.82</td><td align="left" valign="top">2.28</td><td align="left" valign="top">1.58</td><td align="left" valign="top">1.53</td><td align="left" valign="top">1.93</td><td align="left" valign="top">1.50</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.40</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Domain Relevance</td><td align="left" valign="top">2.94</td><td align="left" valign="top">2.68</td><td align="left" valign="top">1.60</td><td align="left" valign="top">1.68</td><td align="left" valign="top">2.09</td><td align="left" valign="top">1.60</td><td align="left" 
valign="top">1.18</td><td align="left" valign="top">1.43</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Usability/Acceptability</td><td align="left" valign="top">2.88</td><td align="left" valign="top">2.55</td><td align="left" valign="top">1.48</td><td align="left" valign="top">1.44</td><td align="left" valign="top">1.99</td><td align="left" valign="top">1.44</td><td align="left" valign="top">1.18</td><td align="left" valign="top">1.32</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>All scores are mean ratings on a 1&#x2010;3 Likert scale (1=disagree, 2=neutral, and 3=agree). </p></fn><fn id="table2fn2"><p><sup>b</sup>LLMs: large language models.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s3-3"><title>Performance of LLMs in Safety and Trustworthiness: Quantitative Evaluation</title><p><xref ref-type="table" rid="table1">Table 1</xref> presents toxicity and bias metrics across models. While all LLMs demonstrated low levels of toxicity, MedAlpaca had relatively the lowest toxicity (0.024), and Meditron had the highest (eg, identity attack: 0.0087). Among general-purpose models, Gemma exhibited the highest toxicity (0.038), whereas Llama 3 showed comparatively lower toxicity (0.033). To assess broader demographic biases, including race and gender, we applied in-context impersonation for racial bias and GenBit scoring for gender bias, following prior work [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. MedAlpaca showed the lowest gender bias (0.903), and Gemma the highest (1.498), followed by Llama 3 (1.43). 
On racial bias, <xref ref-type="fig" rid="figure3">Figure 3</xref> presents similarity scores from sentenceBERT [<xref ref-type="bibr" rid="ref63">63</xref>], showing how well LLMs maintain consistent high performance with low variability regardless of demographic context, as higher scores indicate lower bias. Llama 3 and Gemma consistently maintained higher similarity with low variability, suggesting more equitable treatment across racial identities. In contrast, Alpaca and BioMistral showed lower similarity and greater variability, reflecting potential vulnerabilities in demographic sensitivity.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Similarity scores between responses without context and responses with context (eg, African American, Female, and Hispanic). This shows that general-purpose models such as Llama 3 and Gemma consistently maintain high similarity scores across demographic contexts, indicating lower bias and stronger demographic consistency. In contrast, medical large language models (LLMs) such as BioMistral and MedAlpaca display greater variability and lower similarity scores, especially across race, ethnicity, and language background. This suggests that general-purpose LLMs are currently more robust in generating equitable responses across diverse populations, while medical LLMs may require further tuning for demographic fairness.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e82971_fig03.png"/></fig></sec><sec id="s3-4"><title>Evaluation of Qualitative Content</title><p>Qualitative assessments of safety and trustworthiness, based on expert annotations, revealed moderate agreement on perceived harm (&#x03BA;<sub>w</sub>=0.59) and trust and confidence (&#x03BA;<sub>w</sub>=0.59). 
As shown in <xref ref-type="table" rid="table2">Table 2</xref>, Llama 3 received the highest ratings for both harm reduction (2.96) and trustworthiness (2.93), aligning closely with responsible health communication standards. Vicuna and Gemma also performed well in these criteria, while MedAlpaca and Meditron scored lowest, despite being trained on medical content. Alpaca and Mistral showed moderate performance. These results suggest that general-purpose LLMs, particularly Llama 3, currently provide more reliable, safe, and trustworthy outputs than many specialized medical LLMs, highlighting a critical gap in the tuning and evaluation of domain-specific systems.</p></sec><sec id="s3-5"><title>Performance of LLMs in Communication Accessibility and Affectiveness</title><sec id="s3-5-1"><title>Quantitative Evaluation</title><p>This category evaluates readability and emotional resonance, which are critical for patient-centered communication. As shown in <xref ref-type="table" rid="table1">Table 1</xref>, Alpaca and MedAlpaca produced the most accessible content, with Flesch Reading Ease scores above 59 and Flesch-Kincaid Grade Levels near 8.0, aligning with established guidelines for public health materials. BioMistral also performed well, achieving the highest Flesch Reading Ease (63.73) and lowest SMOG Index (5.84), although it had moderately high complexity scores on other indices. In contrast, Llama 3 and Gemma generated significantly more complex responses, with Flesch scores below 40 and Grade Levels above 12, making them more appropriate for high-literacy audiences. Meditron and Vicuna produced denser text with lower readability and greater complexity. 
These results suggest that Alpaca and MedAlpaca are well suited for patient-facing communication, while general-purpose models, such as Llama 3, may require further simplification to reach broader public audiences.</p></sec><sec id="s3-5-2"><title>Evaluation of Qualitative Content</title><p>Expert ratings showed moderate to substantial agreement across clarity and empathy (&#x03BA;<sub>w</sub>=0.57), domain relevance (&#x03BA;<sub>w</sub>=0.62), and usability and applicability (&#x03BA;<sub>w</sub>=0.53). As summarized in <xref ref-type="table" rid="table2">Table 2</xref>, Llama 3 consistently outperformed the other models across all 5 affective dimensions, including clarity and empathy (2.89), compassion (2.86), cue to action (2.82), domain relevance (2.94), and usability (2.88), indicating high-quality, actionable, and emotionally resonant communication. Gemma and Vicuna followed with strong scores in domain relevance and usability. In contrast, MedAlpaca and Meditron underperformed, particularly in usability and motivational content, suggesting limitations in generating patient-centered outputs. Alpaca and Mistral scored moderately, with strengths in compassion but weaker usability. Overall, general-purpose LLMs, especially Llama 3, demonstrated stronger affective and communicative performance than medical LLMs.</p></sec></sec><sec id="s3-6"><title>Statistical Differences and Effect Sizes Across Models</title><p>We used Welch 1-way ANOVA to test whether model performance differed significantly across all evaluation metrics (<xref ref-type="table" rid="table3">Table 3</xref>). Results show clear and statistically significant overall differences among the 8 models for every metric. Readability and content-quality measures&#x2014;such as BLEURT, BERTScore, ROUGE, and classical readability indices&#x2014;exhibited large effect sizes (&#x03B5;&#x00B2;), indicating substantial separation in linguistic quality across models. 
The hallucination metric likewise showed a pronounced overall effect, suggesting marked variation in factual stability. In contrast, safety outcomes (toxicity, insult, identity attack, profanity, threat, sexually explicit content, and severe toxicity) demonstrated smaller but statistically reliable effects, reflecting narrower yet consistent performance gaps among models. Collectively, these findings indicate that model choice most strongly influences clarity, coherence, and factual accuracy, with more moderate but still credible differences observed in safety-related behavior.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Welch ANOVA across models for each evaluation metric<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metrics</td><td align="left" valign="bottom"><italic>F</italic> test (<italic>df</italic>)</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">&#x03B5;&#x00B2;</td></tr></thead><tbody><tr><td align="left" valign="top">BLEURT Score</td><td align="left" valign="top">1612.988 (7, 11831.33)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.839</td></tr><tr><td align="left" valign="top">BertScore Precision</td><td align="left" valign="top">1091.289 (7, 12154.17)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.576</td></tr><tr><td align="left" valign="top">BertScore Recall</td><td align="left" valign="top">438.361 (7, 12013.52)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.246</td></tr><tr><td align="left" valign="top">BertScore F1</td><td align="left" valign="top">639.625 (7, 12024.49)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" 
valign="top">0.353</td></tr><tr><td align="left" valign="top">Rouge-1</td><td align="left" valign="top">361.108 (7, 11904.96)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.205</td></tr><tr><td align="left" valign="top">Rouge-2</td><td align="left" valign="top">223.332 (7, 11732.70)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.130</td></tr><tr><td align="left" valign="top">Rouge-L</td><td align="left" valign="top">396.495 (7, 11904.96)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.225</td></tr><tr><td align="left" valign="top">Hallucination Score</td><td align="left" valign="top">919.048 (7, 12317.09)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.485</td></tr><tr><td align="left" valign="top">Gender Bias</td><td align="left" valign="top">17294.547 (7, 11451.45)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top">Toxicity Score</td><td align="left" valign="top">132.307 (7, 12270.64)</td><td align="left" valign="top">1.52E-188</td><td align="left" valign="top">0.074</td></tr><tr><td align="left" valign="top">Severe Toxicity</td><td align="left" valign="top">57.246 (7, 12846.45)</td><td align="left" valign="top">3.31E-81</td><td align="left" valign="top">0.031</td></tr><tr><td align="left" valign="top">Identity Attack</td><td align="left" valign="top">87.872 (7, 11603.12)</td><td align="left" valign="top">3.15E-125</td><td align="left" valign="top">0.052</td></tr><tr><td align="left" valign="top">Insult</td><td align="left" valign="top">167.374 (7, 12585.83)</td><td align="left" valign="top">1.11E-237</td><td align="left" valign="top">0.091</td></tr><tr><td align="left" valign="top">Profanity</td><td align="left" valign="top">66.154 (7, 12555.70)</td><td align="left" valign="top">4.07E-94</td><td align="left" valign="top">0.036</td></tr><tr><td 
align="left" valign="top">Threat</td><td align="left" valign="top">66.088 (7, 12724.36)</td><td align="left" valign="top">4.81E-94</td><td align="left" valign="top">0.036</td></tr><tr><td align="left" valign="top">Sexually Explicit</td><td align="left" valign="top">27.462 (7, 11935.72)</td><td align="left" valign="top">1.05E-37</td><td align="left" valign="top">0.015</td></tr><tr><td align="left" valign="top">Flirtation</td><td align="left" valign="top">636.676 (7, 12187.09)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.347</td></tr><tr><td align="left" valign="top">Flesch Reading Ease</td><td align="left" valign="top">536.854 (7, 12953.81)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.278</td></tr><tr><td align="left" valign="top">Flesch-Kincaid Grade Level</td><td align="left" valign="top">638.672 (7, 12984.04)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.328</td></tr><tr><td align="left" valign="top">Gunning Fog Index</td><td align="left" valign="top">719.172 (7, 12807.91)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.372</td></tr><tr><td align="left" valign="top">Smog Index</td><td align="left" valign="top">1952.647 (7, 12354.32)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.955</td></tr><tr><td align="left" valign="top">Automated Readability Index</td><td align="left" valign="top">272.980 (7, 13078.14)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.143</td></tr><tr><td align="left" valign="top">Coleman Liau Index</td><td align="left" valign="top">227.599 (7, 13148.51)</td><td align="left" valign="top">&#x003C;1e-300</td><td align="left" valign="top">0.119</td></tr><tr><td align="left" valign="top">Reflection Score</td><td align="left" valign="top">30.489 (7, 12297.43)</td><td align="left" valign="top">3.93E-42</td><td align="left" 
valign="top">0.017</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Columns report the <italic>F</italic> statistic, 2-sided <italic>P</italic> value, and epsilon-squared (&#x03B5;&#x00B2;) effect size. Extremely small <italic>P</italic> values are shown in scientific notation. &#x03B5;&#x00B2; provides the proportion of variance attributable to model differences in a 1-way (Welch) design; values closer to 1 indicate larger between-model effects. For Gender Bias (genbit_score), &#x03B5;&#x00B2; is capped at 1.000 for interpretability because a simple &#x03B5;&#x00B2; approximation under Welch can slightly exceed 1 in rare cases; the substantive conclusion (&#x201C;very large effect&#x201D;) is unchanged.</p></fn><fn id="table3fn2"><p><sup>b</sup>Columns report the Welch ANOVA <italic>F</italic> statistic with numerator and denominator degrees of freedom within parentheses, 2-sided <italic>P</italic> value, and epsilon-squared effect size. The numerator degree of freedom is 7 for all tests because 8 models were compared. Denominator degree of freedom values are Welch-adjusted and therefore vary by metric. Extremely small <italic>P</italic> values are shown in scientific notation.</p></fn></table-wrap-foot></table-wrap><p>To complement the omnibus tests, we performed pairwise comparisons for every metric and report the full results. Because multiple pairwise tests increase the likelihood of false positives, we adjusted <italic>P</italic> values using the Benjamini-Hochberg procedure to control the false discovery rate at 0.05. After Benjamini-Hochberg adjustment, a large share of model pairs remained statistically different (average significant-pair rate 0.82, median 0.86), indicating consistent between-model separation beyond simple rank ordering. 
On quality metrics, leading models&#x2014;particularly Llama 3 on BLEURT and BERTScore&#x2014;showed meaningful effect sizes (Hedges <italic>g</italic>), whereas safety metrics displayed smaller but still significant gaps (adjusted <italic>P</italic>&#x003C;.05).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we aimed to develop an evaluation framework for effective cancer communication with quantitative and qualitative elements, based on similar work in this field. Working with experts in health communication and health equity, we developed a community-centered evaluation framework that spans three main categories: (1) Linguistic Quality, (2) Safety and Trustworthiness, and (3) Communication Accessibility and Affectiveness. Our findings show that general-purpose LLMs, particularly Llama 3 and Gemma, outperformed specialized medical models in Linguistic Quality, producing more fluent and coherent responses. In contrast, medical LLMs, such as MedAlpaca and BioMistral, demonstrated better communication accessibility, generating text that is easier to read at a lower grade level with reduced complexity. General-purpose LLMs, especially Llama 3, demonstrated more affective communication, while medical LLMs exhibited greater vulnerability in Safety and Trustworthiness, producing responses evaluated as more toxic, more harmful, and more biased.</p><p>General-purpose models such as Llama 3 and Gemma outperformed medical LLMs in fluency, coherence, and factual accuracy. Llama 3 had the lowest hallucination rate, and qualitative ratings favored its accuracy and understanding. Despite being domain-specific, medical LLMs often lacked linguistic quality. Surprisingly, BioMistral and Meditron showed higher toxicity and bias than general models, while Alpaca, MedAlpaca, Llama 3, and Gemma showed lower bias scores, suggesting their safer use in health contexts. 
Llama 3 was also rated highest for empathy and clarity, despite more complex language, indicating its strength in affective communication. In contrast, medical LLMs such as MedAlpaca generated simpler, more readable outputs suitable for public health.</p><p>Specialized medical LLMs, although fine-tuned for health care, underperformed in safety, coherence, and affectiveness, raising concerns for clinical use. Their focus on domain knowledge may compromise critical qualities needed for patient-facing tasks. To address this, future work should embed clinical communication standards (eg, empathy and clarity) and integrate external knowledge representations to improve recall, precision, and scalability [<xref ref-type="bibr" rid="ref65">65</xref>,<xref ref-type="bibr" rid="ref66">66</xref>]. Hybrid neurosymbolic approaches are recommended for safer and more clinically robust outputs [<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref68">68</xref>].</p></sec><sec id="s4-2"><title>Limitations</title><p>The limitations of this study include the use of only open-source models and benchmark datasets, which may not reflect proprietary systems or real patient interactions. Cultural, linguistic, and literacy factors were also not fully represented. Our hallucination metric, based on key-term variation, serves as a practical proxy for factual inconsistency but does not replace source-grounded verification. Future work will integrate retrieval-based fact-checking and uncertainty scoring to enhance robustness. Our ethical assessment relied on proxy measures (toxicity detectors, impersonation-based fairness deltas, and hallucination or error checks) rather than real-user or clinical deployment, so residual risks (eg, nuanced stigma, privacy, or safety impacts) may not be fully captured [<xref ref-type="bibr" rid="ref69">69</xref>]. 
As a next step, we plan to extend this work through simulated patient-clinician interaction studies using the same codebook and web interface developed for this project to evaluate usability, empathy, and real-world communication flow.</p></sec><sec id="s4-3"><title>Conclusions</title><p>This study evaluates how LLMs communicate breast and cervical cancers information, focusing on linguistic quality, safety, and affectiveness. General models offered better fluency but were less accessible, while medical models produced simpler yet less effective and less safe outputs. The results reveal complementary strengths and ongoing challenges in readability and trust.</p></sec></sec></body><back><ack><p>Generative artificial intelligence (AI) tools were used only for grammar and language editing during the preparation of this manuscript. No generative AI tool was used to create the study concept, methodology, analysis, interpretation of results, or substantive manuscript content. The authors wrote the manuscript and remain fully responsible for the accuracy, originality, and integrity of all content.</p></ack><notes><sec><title>Funding</title><p>This work is funded by Microsoft Accelerating Foundation Models Research Program. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of Microsoft.</p></sec><sec><title>Data Availability</title><p>The datasets used in this study are publicly available. Further information and materials related to the datasets may be obtained from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>AS contributed to the conceptualization, methodology design, formal analysis, investigation, data curation, software development, original draft writing, and visualization. VC contributed to conceptualization, methodology, investigation, formal analysis, validation, and writing &#x2013; review and editing. 
ADR contributed to conceptualization, formal analysis, and validation. UK contributed to conceptualization, methodology, investigation, resources, writing &#x2013; review and editing, supervision, project administration, and funding acquisition. MYI contributed to supervision, project administration, resources, writing &#x2013; review and editing, and funding acquisition.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">&#x03BA;<sub>w</sub></term><def><p>weighted Cohen &#x03BA;</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Cancer facts &#x0026; figures 2023</article-title><source>American Cancer Society</source><year>2023</year><access-date>2026-05-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cancer.org/content/dam/cancer-org/research/cancer-facts-and-statistics/annual-cancer-facts-and-figures/2023/2023-cancer-facts-and-figures.pdf">https://www.cancer.org/content/dam/cancer-org/research/cancer-facts-and-statistics/annual-cancer-facts-and-figures/2023/2023-cancer-facts-and-figures.pdf</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Siegel</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Wagle</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Jemal</surname><given-names>A</given-names> </name></person-group><article-title>Cancer statistics, 
2023</article-title><source>CA Cancer J Clin</source><year>2023</year><month>01</month><volume>73</volume><issue>1</issue><fpage>17</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.3322/caac.21763</pub-id><pub-id pub-id-type="medline">36633525</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Warner</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Tamimi</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Hughes</surname><given-names>ME</given-names> </name><etal/></person-group><article-title>Time to diagnosis and breast cancer stage by race/ethnicity</article-title><source>Breast Cancer Res Treat</source><year>2012</year><month>12</month><volume>136</volume><issue>3</issue><fpage>813</fpage><lpage>821</lpage><pub-id pub-id-type="doi">10.1007/s10549-012-2304-1</pub-id><pub-id pub-id-type="medline">23099438</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moore</surname><given-names>JX</given-names> </name><name name-style="western"><surname>Andrzejak</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>S</given-names> </name><name name-style="western"><surname>Han</surname><given-names>Y</given-names> </name></person-group><article-title>Exploring the intersectionality of race/ethnicity with rurality on breast cancer outcomes: SEER analysis, 2000-2016</article-title><source>Breast Cancer Res Treat</source><year>2023</year><month>02</month><volume>197</volume><issue>3</issue><fpage>633</fpage><lpage>645</lpage><pub-id pub-id-type="doi">10.1007/s10549-022-06830-x</pub-id><pub-id pub-id-type="medline">36520228</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="web"><article-title>Key statistics for cervical cancer</article-title><source>American Cancer Society</source><year>2024</year><access-date>2025-05-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cancer.org/cancer/types/cervical-cancer/about/key-statistics.html">https://www.cancer.org/cancer/types/cervical-cancer/about/key-statistics.html</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Olusola</surname><given-names>P</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>HN</given-names> </name><name name-style="western"><surname>Philley</surname><given-names>JV</given-names> </name><name name-style="western"><surname>Dasgupta</surname><given-names>S</given-names> </name></person-group><article-title>Human papilloma virus-associated cervical cancer and health disparities</article-title><source>Cells</source><year>2019</year><month>06</month><day>21</day><volume>8</volume><issue>6</issue><fpage>622</fpage><pub-id pub-id-type="doi">10.3390/cells8060622</pub-id><pub-id pub-id-type="medline">31234354</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moore de Peralta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Holaday</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hadoto</surname><given-names>IM</given-names> </name></person-group><article-title>Cues to cervical cancer screening among U.S. 
Hispanic women</article-title><source>Hisp Health Care Int</source><year>2017</year><month>03</month><volume>15</volume><issue>1</issue><fpage>5</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1177/1540415316682494</pub-id><pub-id pub-id-type="medline">28558513</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spencer</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Tiro</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Racial and ethnic disparities in cervical cancer screening from three U.S. healthcare settings</article-title><source>Am J Prev Med</source><year>2023</year><month>10</month><volume>65</volume><issue>4</issue><fpage>667</fpage><lpage>677</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2023.04.016</pub-id><pub-id pub-id-type="medline">37146839</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Consedine</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Magai</surname><given-names>C</given-names> </name><name name-style="western"><surname>Spiller</surname><given-names>R</given-names> </name><name name-style="western"><surname>Neugut</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Conway</surname><given-names>F</given-names> </name></person-group><article-title>Breast cancer knowledge and beliefs in subpopulations of African American and Caribbean women</article-title><source>Am J Health Behav</source><year>2004</year><volume>28</volume><issue>3</issue><fpage>260</fpage><lpage>271</lpage><pub-id pub-id-type="doi">10.5993/ajhb.28.3.7</pub-id><pub-id 
pub-id-type="medline">15152885</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Best</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Vamos</surname><given-names>C</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Daley</surname><given-names>E</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>DB</given-names> </name></person-group><article-title>Increasing routine cancer screening among underserved populations through effective communication strategies: application of a health literacy framework</article-title><source>J Canc Educ</source><year>2017</year><month>06</month><volume>32</volume><issue>2</issue><fpage>213</fpage><lpage>217</lpage><pub-id pub-id-type="doi">10.1007/s13187-017-1194-7</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yalamanchili</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sengupta</surname><given-names>B</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Quality of large language model responses to radiation oncology patient care questions</article-title><source>JAMA Netw Open</source><year>2024</year><month>04</month><day>1</day><volume>7</volume><issue>4</issue><fpage>e244630</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.4630</pub-id><pub-id pub-id-type="medline">38564215</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Musheyev</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Readability and information quality in cancer information from a free vs paid Chatbot</article-title><source>JAMA Netw Open</source><year>2024</year><month>07</month><day>1</day><volume>7</volume><issue>7</issue><fpage>e2422275</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.22275</pub-id><pub-id pub-id-type="medline">39058491</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Jomy</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Performance of multimodal artificial intelligence chatbots evaluated on clinical oncology cases</article-title><source>JAMA Netw Open</source><year>2024</year><month>10</month><day>1</day><volume>7</volume><issue>10</issue><fpage>e2437711</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.37711</pub-id><pub-id pub-id-type="medline">39441598</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hom</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model influence on diagnostic reasoning: a randomized clinical 
trial</article-title><source>JAMA Netw Open</source><year>2024</year><volume>7</volume><issue>10</issue><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.39466</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chow</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name></person-group><article-title>Developing effective frameworks for large language model-based medical chatbots: insights from radiotherapy education with ChatGPT</article-title><source>JMIR Cancer</source><year>2025</year><month>02</month><day>18</day><volume>11</volume><issue>1</issue><fpage>e66633</fpage><pub-id pub-id-type="doi">10.2196/66633</pub-id><pub-id pub-id-type="medline">39965195</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chow</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name></person-group><article-title>Large language models in medical chatbots: opportunities, challenges, and the need to address AI risks</article-title><source>Information</source><year>2025</year><volume>16</volume><issue>7</issue><fpage>549</fpage><pub-id pub-id-type="doi">10.3390/info16070549</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care 
database</article-title><source>Sci Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><issue>1</issue><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grilo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marques</surname><given-names>C</given-names> </name><name name-style="western"><surname>Corte-Real</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carolino</surname><given-names>E</given-names> </name><name name-style="western"><surname>Caetano</surname><given-names>M</given-names> </name></person-group><article-title>Assessing the quality and reliability of ChatGPT&#x2019;s responses to radiotherapy-related patient queries: comparative study with GPT-3.5 and GPT-4</article-title><source>JMIR Cancer</source><year>2025</year><month>04</month><day>16</day><volume>11</volume><issue>1</issue><fpage>e63677</fpage><pub-id pub-id-type="doi">10.2196/63677</pub-id><pub-id pub-id-type="medline">40239208</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Zitu</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Le</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Duong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Haddadan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Garcia</surname><given-names>M</given-names> </name><name name-style="western"><surname>Amorrortu</surname><given-names>R</given-names> </name><etal/></person-group><source>Large Language Models in Cancer: Potentials, Risks, and 
Safeguards</source><year>2025</year><publisher-name>Oxford University Press</publisher-name><pub-id pub-id-type="doi">10.1093/bjrai/ubae019</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swire-Thompson</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lazer</surname><given-names>D</given-names> </name></person-group><article-title>Public health and online misinformation: challenges and recommendations</article-title><source>Annu Rev Public Health</source><year>2020</year><month>04</month><day>2</day><volume>41</volume><issue>1</issue><fpage>433</fpage><lpage>451</lpage><pub-id pub-id-type="doi">10.1146/annurev-publhealth-040119-094127</pub-id><pub-id pub-id-type="medline">31874069</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abbasian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Khatibi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Azimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Foundation metrics for evaluating effectiveness of healthcare conversations powered by generative AI</article-title><source>NPJ Digit Med</source><year>2024</year><month>03</month><day>29</day><volume>7</volume><issue>1</issue><fpage>82</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01074-z</pub-id><pub-id pub-id-type="medline">38553625</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Guo</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Enhancing uncertainty-based hallucination detection with stronger focus</article-title><conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 6-10, 2023</conf-date><conf-loc>Singapore</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.58</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Perspective API</article-title><source>Jigsaw</source><year>2024</year><access-date>2026-05-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.perspectiveapi.com">https://www.perspectiveapi.com</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Erol</surname><given-names>A</given-names> </name><name name-style="western"><surname>Padhi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kursuncu</surname><given-names>U</given-names> </name><name name-style="western"><surname>Aktas</surname><given-names>ME</given-names> </name></person-group><article-title>Playing devil&#x2019;s advocate: unmasking toxicity and vulnerabilities in large vision-language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.09039</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>W</given-names> </name></person-group><article-title>Long-term trends in the incidence of male breast cancer and nomogram for predicting survival in male breast cancer patients: a population-based epidemiologic study</article-title><source>Sci Rep</source><year>2025</year><month>01</month><day>15</day><volume>15</volume><issue>1</issue><fpage>2027</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-85954-8</pub-id><pub-id pub-id-type="medline">39814936</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anderson</surname><given-names>WF</given-names> </name><name name-style="western"><surname>Jatoi</surname><given-names>I</given-names> </name><name name-style="western"><surname>Tse</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rosenberg</surname><given-names>PS</given-names> </name></person-group><article-title>Male breast cancer: a population-based comparison with female breast cancer</article-title><source>J Clin Oncol</source><year>2010</year><month>01</month><day>10</day><volume>28</volume><issue>2</issue><fpage>232</fpage><lpage>239</lpage><pub-id pub-id-type="doi">10.1200/JCO.2009.23.8162</pub-id><pub-id pub-id-type="medline">19996029</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sengupta</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Maher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Groves</surname><given-names>D</given-names> </name><name name-style="western"><surname>Olieman</surname><given-names>C</given-names> </name></person-group><article-title>GenBiT: measure and mitigate gender bias in language datasets</article-title><source>Microsoft J Appl Res</source><year>2021</year><access-date>2026-05-18</access-date><volume>16</volume><fpage>63</fpage><lpage>71</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.microsoft.com/en-us/research/publication/genbit-measure-and-mitigate-gender-bias-in-language-datasets/">https://www.microsoft.com/en-us/research/publication/genbit-measure-and-mitigate-gender-bias-in-language-datasets/</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Salewski</surname><given-names>L</given-names> </name><name name-style="western"><surname>Alaniz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rio-Torto</surname><given-names>I</given-names> </name><name name-style="western"><surname>Schulz</surname><given-names>E</given-names> </name><name name-style="western"><surname>Akata</surname><given-names>Z</given-names> </name></person-group><article-title>In-context impersonation reveals large language models&#x2019; strengths and biases</article-title><source>arXiv</source><comment>Preprint posted online on  May 24, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.14930</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Levy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Karver</surname><given-names>TS</given-names> </name><name 
name-style="western"><surname>Adler</surname><given-names>WD</given-names> </name><name name-style="western"><surname>Kaufman</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name></person-group><article-title>Evaluating biases in context-dependent health questions</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 7, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.04858</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Poulain</surname><given-names>R</given-names> </name><name name-style="western"><surname>Fayyaz</surname><given-names>H</given-names> </name><name name-style="western"><surname>Beheshti</surname><given-names>R</given-names> </name></person-group><article-title>Bias patterns in the application of LLMs for clinical decision support: a comprehensive study</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.15149</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vela</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Erondu</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Peek</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Woodruff</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Chin</surname><given-names>MH</given-names> </name></person-group><article-title>Eliminating explicit and implicit biases in health care: evidence and research needs</article-title><source>Annu Rev Public 
Health</source><year>2022</year><month>04</month><day>5</day><volume>43</volume><issue>1</issue><fpage>477</fpage><lpage>501</lpage><pub-id pub-id-type="doi">10.1146/annurev-publhealth-052620-103528</pub-id><pub-id pub-id-type="medline">35020445</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salovey</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mayer</surname><given-names>JD</given-names> </name></person-group><article-title>Emotional intelligence</article-title><source>Imagin Cogn Pers</source><year>1990</year><month>03</month><volume>9</volume><issue>3</issue><fpage>185</fpage><lpage>211</lpage><pub-id pub-id-type="doi">10.2190/DUGG-P24E-52WK-6CDG</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Tomkins</surname><given-names>S</given-names> </name></person-group><source>Affect Imagery Consciousness: Volume I: The Positive Affects</source><year>1962</year><publisher-name>Springer Publishing Company</publisher-name><pub-id pub-id-type="other">0826104428</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flesch</surname><given-names>R</given-names> </name></person-group><article-title>A new readability yardstick</article-title><source>J Appl Psychol</source><year>1948</year><month>06</month><volume>32</volume><issue>3</issue><fpage>221</fpage><lpage>233</lpage><pub-id pub-id-type="doi">10.1037/h0057532</pub-id><pub-id pub-id-type="medline">18867058</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name
name-style="western"><surname>Kincaid</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Fishburne</surname><given-names>RP</given-names> </name><name name-style="western"><surname>Rogers</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Chissom</surname><given-names>BS</given-names> </name></person-group><article-title>Derivation of new readability formulas for Navy enlisted personnel</article-title><year>1975</year><publisher-name>Naval Technical Training Command. Research Branch Report</publisher-name><fpage>8</fpage><lpage>75</lpage></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gunning</surname><given-names>R</given-names> </name></person-group><source>The Technique of Clear Writing</source><year>1952</year><publisher-name>McGraw-Hill</publisher-name><pub-id pub-id-type="other">0070252068</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McLaughlin</surname><given-names>GH</given-names> </name></person-group><article-title>SMOG grading&#x2014;a new readability formula</article-title><source>J Reading</source><year>1969</year><access-date>2026-05-19</access-date><volume>12</volume><fpage>639</fpage><lpage>646</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://eric.ed.gov/?id=EJ004016">https://eric.ed.gov/?id=EJ004016</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Senter</surname><given-names>RJ</given-names> </name></person-group><article-title>Automated readability index</article-title><source>AMRL
TR</source><year>1967</year><month>05</month><fpage>1</fpage><lpage>14</lpage><pub-id pub-id-type="medline">5302480</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coleman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liau</surname><given-names>TL</given-names> </name></person-group><article-title>A computer readability formula designed for machine scoring</article-title><source>J Appl Psychol</source><year>1975</year><volume>60</volume><issue>2</issue><fpage>283</fpage><lpage>284</lpage><pub-id pub-id-type="doi">10.1037/h0076540</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Min</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>P&#x00E9;rez-Rosas</surname><given-names>V</given-names> </name><name name-style="western"><surname>Resnicow</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mihalcea</surname><given-names>R</given-names> </name></person-group><article-title>PAIR: prompt-aware margin ranking for counselor reflection scoring in motivational interviewing</article-title><year>2022</year><conf-name>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates</conf-loc><fpage>148</fpage><lpage>158</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.11</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Umapathi</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Sankarasubbu</surname><given-names>M</given-names> </name></person-group><article-title>MedMCQA: a large-scale multi-subject multi-choice dataset for medical domain question answering</article-title><source>PMLR</source><year>2022</year><access-date>2026-05-25</access-date><volume>174</volume><fpage>248</fpage><lpage>260</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v174/pal22a.html">https://proceedings.mlr.press/v174/pal22a.html</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> </name></person-group><article-title>What disease does this patient have?
A large-scale open domain question answering dataset from medical exams</article-title><source>Appl Sci (Basel)</source><year>2021</year><volume>11</volume><issue>14</issue><fpage>6421</fpage><pub-id pub-id-type="doi">10.3390/app11146421</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>J</given-names> </name></person-group><article-title>OLAPH: improving factuality in biomedical long-form question answering</article-title><source>arXiv</source><comment>Preprint posted online on  May 21, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.12701</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agichtein</surname><given-names>E</given-names> </name><name name-style="western"><surname>Carmel</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pelleg</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pinter</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Harman</surname><given-names>D</given-names> </name></person-group><article-title>Overview of the TREC 2015 LiveQA track</article-title><year>2015</year><access-date>2026-05-15</access-date><conf-name>Proceedings of the Twenty-Fourth Text REtrieval Conference, TREC 2015</conf-name><conf-date>Nov 17-20, 2015</conf-date><conf-loc>Gaithersburg, Maryland, USA</conf-loc><publisher-name>National Institute of 
Standards and Technology (NIST)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="http://trec.nist.gov/pubs/trec24/papers/Overview-QA.pdf">http://trec.nist.gov/pubs/trec24/papers/Overview-QA.pdf</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abacha</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Mrabet</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sharp</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goodwin</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Shooshan</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name></person-group><article-title>Bridging the gap between consumers&#x2019; medication questions and trusted answers</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>08</month><day>21</day><volume>264</volume><fpage>25</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.3233/SHTI190176</pub-id><pub-id pub-id-type="medline">31437878</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Manes</surname><given-names>I</given-names> </name><name name-style="western"><surname>Ronn</surname><given-names>N</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ilan Ber</surname><given-names>R</given-names> </name><name name-style="western"><surname>Horowitz-Kugler</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Stanovsky</surname><given-names>G</given-names> </name></person-group><article-title>K-QA: a real-world 
medical Q&#x0026;A benchmark</article-title><conf-name>Proceedings of the 23rd Workshop on Biomedical Natural Language Processing</conf-name><conf-date>Aug 16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><fpage>277</fpage><lpage>294</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.bionlp-1.22</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>ChatDoctor: a medical chat model fine-tuned on a Large Language Model Meta-AI (LLaMA) using medical domain knowledge</article-title><source>Cureus</source><year>2023</year><month>06</month><volume>15</volume><issue>6</issue><fpage>e40895</fpage><pub-id pub-id-type="doi">10.7759/cureus.40895</pub-id><pub-id pub-id-type="medline">37492832</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Taori</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gulrajani</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dubois</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> 
</name><etal/></person-group><article-title>Alpaca: a strong, replicable instruction-following model</article-title><source>Stanford Center for Research on Foundation Models</source><year>2023</year><access-date>2026-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://crfm.stanford.edu/2023/03/13/alpaca.html">https://crfm.stanford.edu/2023/03/13/alpaca.html</ext-link></comment></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kadian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Dahle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Letman</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bamford</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chaplot</surname><given-names>DS</given-names> </name><name name-style="western"><surname>de las Casas</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Mistral
7B</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.06825</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mesnard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hardin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dadashi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bhupatiraju</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Gemma: open models based on Gemini research and technology</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 13, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.08295</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bommasani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tsipras</surname><given-names>D</given-names> </name><name name-style="western"><surname>Soylu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yasunaga</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Holistic evaluation of language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 16, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2211.09110</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Papaioannou</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Grundmann</surname><given-names>P</given-names> </name><name name-style="western"><surname>Oberhauser</surname><given-names>T</given-names> </name><name name-style="western"><surname>L&#x00F6;ser</surname><given-names>A</given-names> </name><etal/></person-group><article-title>MedAlpaca&#x2013;an open-source collection of medical conversational AI models and training data</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 14, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.08247</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Labrak</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bazoge</surname><given-names>A</given-names> </name><name name-style="western"><surname>Morin</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gourraud</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Rouvier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dufour</surname><given-names>R</given-names> </name></person-group><article-title>BioMistral: a collection of open-source pretrained large language models for medical domains</article-title><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>2024</conf-date><conf-loc>Bangkok, Thailand and virtual meeting</conf-loc><fpage>5848</fpage><lpage>5864</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.348</pub-id></nlm-citation></ref><ref 
id="ref56"><label>56</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Cano</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Romanou</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bonnet</surname><given-names>A</given-names> </name><name name-style="western"><surname>Matoba</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salvi</surname><given-names>F</given-names> </name><etal/></person-group><article-title>MEDITRON-70B: scaling medical pretraining for large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.16079</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Leong</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>YF</given-names> </name><name name-style="western"><surname>Shuai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pamuksuz</surname><given-names>U</given-names> </name></person-group><article-title>Efficient fine-tuning of large language models for automated medical documentation</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 14, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.09324</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Welch</surname><given-names>BL</given-names> </name></person-group><article-title>On the
comparison of several mean values: an alternative approach</article-title><source>Biometrika</source><year>1951</year><month>12</month><volume>38</volume><issue>3/4</issue><fpage>330</fpage><pub-id pub-id-type="doi">10.2307/2332579</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Games</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Howell</surname><given-names>JF</given-names> </name></person-group><article-title>Pairwise multiple comparison procedures with unequal N&#x2019;s and/or variances: a Monte Carlo study</article-title><source>J Educat Stat</source><year>1976</year><volume>1</volume><issue>2</issue><fpage>113</fpage><pub-id pub-id-type="doi">10.2307/1164979</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hedges</surname><given-names>LV</given-names> </name></person-group><article-title>Distribution theory for Glass&#x2019;s estimator of effect size and related estimators</article-title><source>J Educ Behav Stat</source><year>1981</year><month>06</month><volume>6</volume><issue>2</issue><fpage>107</fpage><lpage>128</lpage><pub-id pub-id-type="doi">10.3102/10769986006002107</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>SO</given-names> </name></person-group><article-title>Can Likert scales be treated as interval scales?&#x2014;a simulation study</article-title><source>J Soc Serv Res</source><year>2017</year><month>08</month><day>8</day><volume>43</volume><issue>4</issue><fpage>527</fpage><lpage>532</lpage><pub-id 
pub-id-type="doi">10.1080/01488376.2017.1329775</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="web"><article-title>Simply put: a guide for creating easy-to-understand materials</article-title><source>Centers for Disease Control and Prevention</source><year>2009</year><access-date>2026-05-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/healthliteracy/pdf/simply_put.pdf">https://www.cdc.gov/healthliteracy/pdf/simply_put.pdf</ext-link></comment></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>An application of hierarchical kappa-type statistics in the assessment of majority agreement among multiple observers</article-title><source>Biometrics</source><year>1977</year><month>06</month><volume>33</volume><issue>2</issue><fpage>363</fpage><lpage>374</lpage><pub-id pub-id-type="doi">10.2307/2529786</pub-id><pub-id pub-id-type="medline">884196</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>T</given-names> </name></person-group><article-title>Kappa statistic considerations in evaluating inter-rater reliability between two raters: which, when and context matters</article-title><source>BMC Cancer</source><year>2023</year><volume>23</volume><issue>1</issue><fpage>799</fpage><pub-id pub-id-type="doi">10.1186/s12885-023-11325-z</pub-id></nlm-citation></ref><ref 
id="ref65"><label>65</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Knowledge-infused prompting: assessing and advancing clinical text data generation with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 1, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.00287</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Khandelwal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Gaur</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kursuncu</surname><given-names>U</given-names> </name><name name-style="western"><surname>Shalin</surname><given-names>VL</given-names> </name><name name-style="western"><surname>Sheth</surname><given-names>AP</given-names> </name></person-group><article-title>A domain-agnostic neurosymbolic approach for big social data analysis: evaluating mental health sentiment on social media during COVID-19</article-title><year>2024</year><conf-name>2024 IEEE International Conference on Big Data, BigData 2024</conf-name><conf-date>Dec 15-18, 2024</conf-date><conf-loc>Washington, USA</conf-loc><publisher-name>Institute of Electrical and Electronics Engineers Inc</publisher-name><fpage>959</fpage><lpage>968</lpage><pub-id pub-id-type="doi">10.1109/BigData62323.2024.10825174</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Meng</surname><given-names>H</given-names> 
</name><name name-style="western"><surname>Lin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>L</given-names> </name></person-group><article-title>Knowledge distillation in medical data mining: a survey</article-title><conf-name>5th International Conference on Crowd Science and Engineering</conf-name><conf-date>Oct 16-18, 2021</conf-date><conf-loc>Jinan, China</conf-loc><fpage>175</fpage><lpage>182</lpage><pub-id pub-id-type="doi">10.1145/3503181.3503211</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Garg</surname><given-names>R</given-names> </name><name name-style="western"><surname>Padhi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kursuncu</surname><given-names>U</given-names> </name><name name-style="western"><surname>Kumaraguru</surname><given-names>P</given-names> </name></person-group><article-title>Just KIDDIN: knowledge infusion and distillation for detection of indecent memes</article-title><source>arXiv</source><comment>Preprint posted online on Nov 19, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2411.12174</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chow</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name></person-group><article-title>Ethical considerations in human-centered AI: advancing oncology chatbots through large language 
models</article-title><source>JMIR Bioinform Biotech</source><year>2024</year><volume>5</volume><fpage>e64406</fpage><pub-id pub-id-type="doi">10.2196/64406</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Sample question from the curated dataset.</p><media xlink:href="cancer_v12i1e82971_app1.pdf" xlink:title="PDF File, 86 KB"/></supplementary-material></app-group></back></article>