<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Cancer</journal-id><journal-id journal-id-type="publisher-id">cancer</journal-id><journal-id journal-id-type="index">21</journal-id><journal-title>JMIR Cancer</journal-title><abbrev-journal-title>JMIR Cancer</abbrev-journal-title><issn pub-type="epub">2369-1999</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e72839</article-id><article-id pub-id-type="doi">10.2196/72839</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>ChatGPT Versus DeepSeek for Breast Cancer Information Retrieval: Quantitative Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hajjo</surname><given-names>Rima</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sabbah</surname><given-names>Dima A</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bardaweel</surname><given-names>Sanaa K</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Pharmacy, Faculty of Pharmacy, Al-Zaytoonah University of Jordan</institution><addr-line>Airport Street, P.O. 
Box 130</addr-line><addr-line>Amman</addr-line><country>Jordan</country></aff><aff id="aff2"><institution>Department of Pharmaceutical Sciences, School of Pharmacy, University of Jordan</institution><addr-line>Amman</addr-line><country>Jordan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Bender</surname><given-names>Jackie</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mesko</surname><given-names>Bertalan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Xu</surname><given-names>Dong</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ogunsakin</surname><given-names>Jamiu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Rima Hajjo, PhD, Department of Pharmacy, Faculty of Pharmacy, Al-Zaytoonah University of Jordan, Airport Street, P.O. Box 130, Amman, 11733, Jordan, 962 64291511; <email>r.hajjo@zuj.edu.jo</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>27</day><month>2</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e72839</elocation-id><history><date date-type="received"><day>19</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>19</day><month>11</month><year>2025</year></date></history><copyright-statement>&#x00A9; Rima Hajjo, Dima A Sabbah, Sanaa K Bardaweel. Originally published in JMIR Cancer (<ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org">https://cancer.jmir.org</ext-link>), 27.2.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cancer, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org/">https://cancer.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://cancer.jmir.org/2026/1/e72839"/><abstract><sec><title>Background</title><p>Artificial intelligence (AI) is increasingly used to generate medical content, yet its performance in delivering clinically relevant and reliable information remains underexplored, especially in complex areas such as breast cancer.</p></sec><sec><title>Objective</title><p>This study aimed to compare ChatGPT-4.0 and DeepSeek-V3 in generating breast cancer information, focusing on readability, content quality, and citation reliability.</p></sec><sec sec-type="methods"><title>Methods</title><p>On the basis of publicly available patient education materials, 10 frequently asked questions were selected. Each model generated 60 responses. Three expert reviewers rated each response using a 7-point Likert scale across 5 dimensions (ie, accuracy, completeness, clarity, depth and insight, and alignment with expert answers). Readability was assessed using Flesch-Kincaid Grade Level scores. Information reliability was evaluated through interrater agreement metrics, including Cohen &#x03BA; and Fleiss &#x03BA;. 
Paired <italic>t</italic> tests were used for statistical comparisons.</p></sec><sec sec-type="results"><title>Results</title><p>AI models produced significantly more readable content than expert references (mean Flesch-Kincaid Grade Level difference &#x2212;2.60; <italic>P</italic>&#x003C;.001). ChatGPT-4.0 responses were more stylistically consistent with a median Flesch-Kincaid Grade Level score of 10.66 (IQR 0.98), whereas DeepSeek-V3 showed greater variability with a median Flesch-Kincaid Grade Level score of 10.17 (IQR 1.41). Content quality scores were comparable, with DeepSeek-V3 achieving a higher mean score than ChatGPT-4.0 (6.22 [SD 0.43] vs 6.01 [SD 0.49]). In the multiresponse analysis, DeepSeek-V3 demonstrated a statistically significant advantage in accuracy (<italic>P</italic>=.041), while differences across other criteria were not statistically significant (<italic>P</italic>&#x003E;.05). Human raters showed almost perfect agreement when judging source reliability (Fleiss &#x03BA;=0.842 for ChatGPT&#x2019;s citations and 0.935 for DeepSeek&#x2019;s citations). Agreement between each model&#x2019;s citation reliability scores and the expert majority was substantial for ChatGPT (Cohen &#x03BA;=0.665) and higher for DeepSeek (Cohen &#x03BA;=0.782).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Both models generated readable and clinically relevant content with comparable overall performance. ChatGPT provided more consistent readability, while DeepSeek offered more diverse references with stronger alignment to expert ratings. 
Continued evaluation and quality assurance are essential for the responsible clinical use of AI-generated content.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>breast cancer</kwd><kwd>ChatGPT</kwd><kwd>DeepSeek</kwd><kwd>large language models</kwd><kwd>AI</kwd><kwd>LLM</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Breast cancer is the most commonly diagnosed cancer among women and remains a major global health burden. Despite advancements in detection and treatment, late-stage presentation continues to hinder outcomes [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. The World Health Organization highlights the need for improved screening and care access [<xref ref-type="bibr" rid="ref5">5</xref>], while socioeconomic factors such as education and income shape care-seeking behavior. Many women turn to the internet for information on early signs and symptoms [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], although online health content is often inaccurate or misleading [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>In response, large language models (LLMs), such as ChatGPT and DeepSeek, have emerged as potential tools for generating medical information and answering patient queries [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Trained on extensive biomedical literature and general data sources, these generative artificial intelligence (AI) models can produce natural language responses to diverse health-related questions [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], potentially supporting patient education and access to general medical knowledge [<xref ref-type="bibr" rid="ref20">20</xref>]. 
Promising results have been demonstrated across multiple areas, including medical imaging, disease detection, drug discovery, and personalized oncology, with some AI models surpassing traditional diagnostic accuracy [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. In oncology, AI aids in drug delivery, genomics integration, and real-time monitoring to improve outcomes [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Nonetheless, concerns persist regarding the accuracy, readability, and reliability of AI-generated content, especially in complex domains such as breast cancer, necessitating further validation studies [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>A recent work by Ye et al [<xref ref-type="bibr" rid="ref31">31</xref>] systematically evaluated ChatGPT-3.5&#x2019;s performance in responding to frequently asked questions (FAQs) on cervical and breast cancer. Their study used 10 validated FAQs developed through expert consensus and compared ChatGPT&#x2019;s responses to those of experienced gynecologists and mammography specialists. Using a 7-point Likert scale, they assessed multiple dimensions of response quality, including accuracy, readability, consistency, reliability, efficiency, and relevance. While ChatGPT 3.5 demonstrated high accuracy, often comparable to or exceeding physician responses, it also exhibited limitations in readability, consistency, and the reliability of cited sources. Although foundational, this study did not evaluate more advanced generative AI models, such as ChatGPT-4.0 or DeepSeek-V3, whose capabilities remain largely unexplored [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. 
Thus, further research is needed to assess how newer models compare in generating high-quality, clinically relevant patient education content.</p><p>Therefore, the aim of this study was to systematically evaluate and compare ChatGPT-4.0 and DeepSeek-V3 in their ability to generate high-quality, patient-facing breast cancer information. Using an established, expert-validated set of breast cancer FAQs, we assessed each model across multiple dimensions of response quality, including readability, accuracy, completeness, clarity, depth, insight, alignment with expert reference answers, and citation reliability, with the goal of determining whether these systems can provide trustworthy, clinically relevant educational content for patients.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Workflow</title><p>This study evaluated the performance of generative AI models in answering 10 FAQs related to breast cancer as described in the workflow shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The 10 validated questions and their corresponding expert reference answers were adopted from the study by Ye et al [<xref ref-type="bibr" rid="ref31">31</xref>], which served as the benchmark dataset for this evaluation. In their foundational study, Ye et al [<xref ref-type="bibr" rid="ref31">31</xref>] developed a list of FAQs regarding breast cancer and cervical cancer based on popular science materials and current hot topics related to tertiary prevention. A set of 10 questions was designed for each cancer type. Subsequently, experts were invited to review the proposed questions and to formulate expert consensus answers. 
The final set of questions and expert consensus answers were determined through several rounds of consultation and assessment.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow for evaluating the performance of ChatGPT and DeepSeek in retrieving and presenting medical information. FKGL: Flesch-Kincaid Grade Level.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e72839_fig01.png"/></fig></sec><sec id="s2-2"><title>Data Collection and Processing</title><p><xref ref-type="fig" rid="figure2">Figure 2</xref> summarizes the data collection and assessment procedures used to evaluate the performance of ChatGPT-4.0 and DeepSeek-V3 across three core dimensions: readability, content quality, and information reliability. Each objective shown in <xref ref-type="fig" rid="figure2">Figure 2</xref> was addressed using a tailored methodology aligned with specific types of data and evaluation criteria.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Workflow of data collection for assessment objectives. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e72839_fig02.png"/></fig></sec><sec id="s2-3"><title>AI Response Generation</title><p>To evaluate AI-generated content, we submitted 10 breast cancer FAQs to two generative AI models: ChatGPT-4.0 (OpenAI) and DeepSeek-V3 (DeepSeek). 
To ensure reproducibility and minimize bias, 3 independent researchers queried each model twice per question at different times, locations, and internet connections, yielding 6 responses per question per model (total: 60 responses per model and 120 responses overall).</p></sec><sec id="s2-4"><title>Expert Reference Answers</title><p>Expert reference answers for each FAQ were obtained from the consensus of 5 board-certified physicians specializing in mammography, as reported by Ye et al [<xref ref-type="bibr" rid="ref31">31</xref>]. These physicians independently reviewed each FAQ and provided responses, which were then harmonized into a single consensus answer per question. This consensus served as the gold standard for evaluating AI performance.</p><p>The 10 breast cancer FAQs and the corresponding AI-generated answers from each independent researcher are provided in Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref><xref ref-type="supplementary-material" rid="app2"/>-<xref ref-type="supplementary-material" rid="app3">3</xref>, and the expert consensus reference answers are provided in Table S4 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p></sec><sec id="s2-5"><title>Data Analysis</title><sec id="s2-5-1"><title>Readability Assessment</title><p>The readability evaluation was performed using the Flesch-Kincaid Grade Level (FKGL) analysis [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. The textstat library (version 0.7.7; developed by Shivam Bansal, open-source project) [<xref ref-type="bibr" rid="ref36">36</xref>] in Python was used to compute the FKGL scores for expert reference answers and responses generated by ChatGPT-4.0 and DeepSeek-V3. FKGL estimates the US school grade level required to understand a text, with higher scores indicating greater complexity. 
The FKGL score was calculated using the following formula:</p><p><inline-formula><mml:math id="ieqn1"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>F</mml:mi><mml:mi>K</mml:mi><mml:mi>G</mml:mi><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mn>0.39</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>w</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>d</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:mfrac><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mn>11.8</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>s</mml:mi><mml:mi>y</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>w</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>d</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:mfrac><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>15.59</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p><p>This analysis used two datasets: dataset 1, where expert reference answers were compared to the initial responses from ChatGPT-4.0 or DeepSeek-V3 for all questions (Q1-Q10); 
and dataset 2, where 3 researchers generated 2 responses each from ChatGPT-4.0 and DeepSeek-V3 for Q1-Q10, resulting in 60 responses per model. To evaluate statistical significance, paired <italic>t</italic> tests were conducted using the SciPy library (scipy.stats library; SciPy Developers, open-source project) within Python [<xref ref-type="bibr" rid="ref37">37</xref>], as FKGL scores were normally distributed. These tests assessed mean differences between ChatGPT-4.0 and DeepSeek-V3 while ensuring paired comparisons across identical question sets.</p></sec><sec id="s2-5-2"><title>Content Quality Assessment</title><p>Content quality was evaluated across five predefined dimensions: accuracy, completeness, clarity, depth and insight, and alignment with expert reference answers. These dimensions were adapted from established guidelines for assessing the quality of health information and AI-generated content [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Each AI-generated answer was independently scored by 3 expert reviewers using a 7-point Likert-type scale, where 1 represented poor performance and 7 represented excellent performance.</p><p>To compare the performance of ChatGPT-4.0 and DeepSeek-V3 across these dimensions, statistical analysis was performed using Python&#x2019;s SciPy library (scipy.stats module, version 1.15.1; SciPy Developers, open-source project) [<xref ref-type="bibr" rid="ref37">37</xref>]. The Shapiro-Wilk test was first applied to assess the normality of score distributions [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. As the data met the normality assumptions, paired <italic>t</italic> tests [<xref ref-type="bibr" rid="ref42">42</xref>] were conducted to compare mean scores across the 5 evaluation criteria. 
This approach enabled a robust and consistent comparison of content quality between the 2 models using identical question sets and standardized scoring methods.</p></sec><sec id="s2-5-3"><title>Information Reliability Assessment</title><p>The information sources cited by ChatGPT-4.0 and DeepSeek-V3 were independently evaluated by 3 domain experts (RH, DAS, and SKB) with expertise in medicinal chemistry (all), molecular oncology (SKB), and computational drug discovery (RH and DAS). Each unique cited reference was assigned a binary reliability score: 1 for reliable sources (peer-reviewed journal articles, clinical guidelines, or official or authoritative bodies, such as national health agencies or major academic medical centers) and 0 for unreliable sources (commercial health media, foundation blogs, unsourced claims, or otherwise nonverifiable material). Complete source lists and scores are provided in Table S5 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> (ChatGPT-4.0) and Table S6 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref> (DeepSeek-V3), and the full question-level reference data for both ChatGPT-4.0 and DeepSeek-V3 are provided in Table S7 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>.</p><p>For each AI model, we then performed 2 levels of agreement analysis. First, we assessed interrater reliability among the three human experts using Fleiss &#x03BA; [<xref ref-type="bibr" rid="ref43">43</xref>], and we also quantified the proportion of references with perfect agreement (all 3 raters assigned the same score), majority agreement (at least two raters agreed on the same score), and complete disagreement. Second, we evaluated how well each model aligned with human judgment by comparing the model&#x2019;s score for each reference to the majority human score (the score agreed upon by &#x2265;2 raters). 
Agreement between each AI model and this consensus was quantified using Cohen &#x03BA; [<xref ref-type="bibr" rid="ref44">44</xref>].</p><p>To further characterize disagreement, we identified references where ChatGPT or DeepSeek diverged from the expert majority score and summarized the frequency and nature of those disagreements. All scoring data (individual rater scores, AI-assigned scores, and majority scores) and all agreement calculations (Fleiss &#x03BA; and Cohen &#x03BA;) were generated in Python (pandas, NumPy, and scikit-learn) [<xref ref-type="bibr" rid="ref45">45</xref>-<xref ref-type="bibr" rid="ref47">47</xref>] and compiled into Microsoft Excel for audit trails, including disagreement tables and summary statistics for each model.</p><p>This evaluation framework allowed us to simultaneously measure (1) consistency among human raters when judging citation reliability and (2) how closely each AI system&#x2019;s choices agreed with what experts considered acceptable evidence.</p></sec></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study did not involve human participants, patient data, or intervention and therefore did not require review or approval by an institutional review board or research ethics committee. The analyses were based exclusively on AI-generated text responses and publicly available information sources, without collection of any personal, identifiable, or sensitive data. As no human subjects were recruited or interacted with, informed consent was not required, and no consent waiver was necessary. No compensation was provided. 
All data were handled in accordance with principles of data minimization and confidentiality, and no private or proprietary information was accessed or stored during the study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Design and Analytical Framework</title><p>To evaluate the performance of ChatGPT and DeepSeek, a structured analytical framework was implemented, as illustrated in Figure 1. A comparison of ChatGPT and DeepSeek was conducted using 10 FAQs about breast cancer, with responses evaluated against consensus expert reference answers as outlined in the Methods section. The workflow, illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>, consisted of three key steps: data collection, model querying, and visual and statistical analysis. In the data collection phase, 10 common breast cancer questions were identified, and expert consensus answers were established. Three independent researchers then queried ChatGPT and DeepSeek, generating responses. Two datasets were created for each model: dataset 1, consisting of single-instance responses (1 response per question); and dataset 2, containing multiresponse outputs (multiple responses per question), resulting in 10 responses from each model for dataset 1 and 60 responses from each model for dataset 2 (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p></sec><sec id="s3-2"><title>Readability Assessment</title><p>The FKGL readability scores for responses generated by ChatGPT and DeepSeek, compared against expert reference answers, are presented in <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref> and <xref ref-type="fig" rid="figure3">Figure 3</xref>. 
Two types of analyses were performed: single-instance analysis, where each AI model provided a single response per question; and multiresponse analysis, where multiple responses per question were collected by 3 researchers independently from both models.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Readability assessment using Flesch-Kincaid Grade Level (FKGL) scores for expert reference answers, ChatGPT, and DeepSeek responses. (A) Bar plot comparing FKGL scores for 10 questions using 1 response per question from each platform. The bars represent the FKGL scores for reference, ChatGPT, and DeepSeek. (B) Line plot showing the overall trends of FKGL scores across all questions using 1 response per question from each platform. The lines represent the FKGL scores for reference, ChatGPT, and DeepSeek. (C) Box plot showing the distribution of FKGL scores using 1 response per question from each platform. The boxes represent the IQR, with the median annotated inside each box. Reference, ChatGPT, and DeepSeek are represented in blue, orange, and green, respectively. (D) Bar plot comparing FKGL scores for 10 questions using 6 responses per question per platform. The bars represent the average FKGL scores, with error bars indicating the SD. ChatGPT responses are shown in orange, and DeepSeek responses are shown in green. (E) Line plot showing the overall trends of FKGL scores for ChatGPT and DeepSeek responses across 10 questions using 6 responses per question per platform. The solid lines represent the mean FKGL scores, and the shaded regions indicate the SDs. (F) Box plot showing the distribution of FKGL scores for ChatGPT and DeepSeek responses across 10 questions using 6 responses per question per platform. The boxes represent the IQR, with the median annotated inside each box. 
The <italic>y</italic>-axis for all line and box plots ranges from 7 to 15 to ensure a clear comparison of FKGL variations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e72839_fig03.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of readability scores of Flesch-Kincaid Grade Level (FKGL) of single-instance analysis for expert reference answers, ChatGPT-4.0, and DeepSeek-V3.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Question</td><td align="left" valign="top">Expert reference answer<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">ChatGPT-4.0<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top">DeepSeek-V3<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">FKGL score</td><td align="left" valign="top">FKGL score</td><td align="left" valign="top">Difference<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">FKGL score</td><td align="left" valign="top">Difference<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Q1</td><td align="left" valign="top">7.30</td><td align="left" valign="top">10.80</td><td align="left" valign="top">3.50</td><td align="left" valign="top">9.90</td><td align="left" valign="top">2.60</td></tr><tr><td align="left" valign="top">Q2</td><td align="left" valign="top">9.90</td><td align="left" valign="top">11.30</td><td align="left" valign="top">1.40</td><td align="left" valign="top">10.30</td><td align="left" valign="top">0.40</td></tr><tr><td align="left" valign="top">Q3</td><td align="left" valign="top">9.90</td><td align="left" 
valign="top">11.20</td><td align="left" valign="top">1.30</td><td align="left" valign="top">9.70</td><td align="left" valign="top">&#x2212;0.20</td></tr><tr><td align="left" valign="top">Q4</td><td align="left" valign="top">9.50</td><td align="left" valign="top">10.90</td><td align="left" valign="top">1.40</td><td align="left" valign="top">12.40</td><td align="left" valign="top">2.90</td></tr><tr><td align="left" valign="top">Q5</td><td align="left" valign="top">11.30</td><td align="left" valign="top">9.30</td><td align="left" valign="top">&#x2212;2.00</td><td align="left" valign="top">10.60</td><td align="left" valign="top">&#x2212;0.70</td></tr><tr><td align="left" valign="top">Q6</td><td align="left" valign="top">11.00</td><td align="left" valign="top">9.00</td><td align="left" valign="top">&#x2212;2.00</td><td align="left" valign="top">10.60</td><td align="left" valign="top">&#x2212;0.40</td></tr><tr><td align="left" valign="top">Q7</td><td align="left" valign="top">12.80</td><td align="left" valign="top">8.40</td><td align="left" valign="top">&#x2212;4.40</td><td align="left" valign="top">8.40</td><td align="left" valign="top">&#x2212;4.40</td></tr><tr><td align="left" valign="top">Q8</td><td align="left" valign="top">8.10</td><td align="left" valign="top">11.30</td><td align="left" valign="top">3.20</td><td align="left" valign="top">9.90</td><td align="left" valign="top">1.80</td></tr><tr><td align="left" valign="top">Q9</td><td align="left" valign="top">14.80</td><td align="left" valign="top">10.30</td><td align="left" valign="top">&#x2212;4.50</td><td align="left" valign="top">10.70</td><td align="left" valign="top">&#x2212;4.10</td></tr><tr><td align="left" valign="top">Q10</td><td align="left" valign="top">10.70</td><td align="left" valign="top">8.60</td><td align="left" valign="top">&#x2212;2.10</td><td align="left" valign="top">10.90</td><td align="left" valign="top">0.20</td></tr></tbody></table><table-wrap-foot><fn 
id="table1fn1"><p><sup>a</sup>Statistics across all the responses for 10 questions are as follows: mean 10.53 (SD 2.17) and median 10.30 (IQR 1.63).</p></fn><fn id="table1fn2"><p><sup>b</sup>Statistics across all the responses for 10 questions are as follows: mean 10.11 (SD 1.17) and median 10.55 (IQR 2.05).</p></fn><fn id="table1fn3"><p><sup>c</sup>Statistics across all the responses for 10 questions are as follows: mean 10.53 (SD 1.02) and median 10.45 (IQR 0.77).</p></fn><fn id="table1fn4"><p><sup>d</sup>Difference in comparison to reference. Positive difference indicates that the FKGL score was higher than that of reference, whereas negative difference means that the KFGL scores were lower than that of reference.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of readability scores of Flesch-Kincaid Grade Level (FKGL) of multiresponse analysis for expert reference answers, ChatGPT-4.0, and DeepSeek-V3.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Question</td><td align="left" valign="bottom">Expert reference answer<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="3">ChatGPT-4.0<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="3">DeepSeek-V3<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr><tr><td align="left" valign="bottom">FKGL score</td><td align="left" valign="bottom">FKGL score</td><td align="left" valign="bottom">Difference<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="bottom">SD</td><td align="left" valign="bottom">FKGL score</td><td align="left" valign="bottom">Difference<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="bottom">SD</td></tr></thead><tbody><tr><td align="left" 
valign="top">Q1</td><td align="char" char="." valign="top">7.30</td><td align="char" char="." valign="top">8.63</td><td align="char" char="." valign="top">1.33</td><td align="char" char="." valign="top">1.32</td><td align="char" char="." valign="top">8.73</td><td align="char" char="." valign="top">1.43</td><td align="char" char="." valign="top">1.43</td></tr><tr><td align="left" valign="top">Q2</td><td align="char" char="." valign="top">9.90</td><td align="char" char="." valign="top">11.20</td><td align="char" char="." valign="top">1.30</td><td align="char" char="." valign="top">0.77</td><td align="char" char="." valign="top">11.47</td><td align="char" char="." valign="top">1.57</td><td align="char" char="." valign="top">0.78</td></tr><tr><td align="left" valign="top">Q3</td><td align="char" char="." valign="top">9.90</td><td align="char" char="." valign="top">11.17</td><td align="char" char="." valign="top">1.27</td><td align="char" char="." valign="top">0.76</td><td align="char" char="." valign="top">9.73</td><td align="char" char="." valign="top">&#x2212;0.17</td><td align="char" char="." valign="top">0.32</td></tr><tr><td align="left" valign="top">Q4</td><td align="char" char="." valign="top">9.50</td><td align="char" char="." valign="top">10.93</td><td align="char" char="." valign="top">1.43</td><td align="char" char="." valign="top">0.67</td><td align="char" char="." valign="top">11.67</td><td align="char" char="." valign="top">2.17</td><td align="char" char="." valign="top">0.79</td></tr><tr><td align="left" valign="top">Q5</td><td align="char" char="." valign="top">11.30</td><td align="char" char="." valign="top">10.42</td><td align="char" char="." valign="top">&#x2212;0.88</td><td align="char" char="." valign="top">1.06</td><td align="char" char="." valign="top">10.68</td><td align="char" char="." valign="top">&#x2212;0.62</td><td align="char" char="." valign="top">0.67</td></tr><tr><td align="left" valign="top">Q6</td><td align="char" char="." 
valign="top">11.00</td><td align="char" char="." valign="top">10.17</td><td align="char" char="." valign="top">&#x2212;0.83</td><td align="char" char="." valign="top">1.09</td><td align="char" char="." valign="top">11.2</td><td align="char" char="." valign="top">0.20</td><td align="char" char="." valign="top">0.96</td></tr><tr><td align="left" valign="top">Q7</td><td align="char" char="." valign="top">12.80</td><td align="char" char="." valign="top">9.47</td><td align="char" char="." valign="top">&#x2212;3.33</td><td align="char" char="." valign="top">1.71</td><td align="char" char="." valign="top">9.63</td><td align="char" char="." valign="top">&#x2212;3.17</td><td align="char" char="." valign="top">1.00</td></tr><tr><td align="left" valign="top">Q8</td><td align="char" char="." valign="top">8.10</td><td align="char" char="." valign="top">10.92</td><td align="char" char="." valign="top">2.82</td><td align="char" char="." valign="top">0.38</td><td align="char" char="." valign="top">9.20</td><td align="char" char="." valign="top">1.10</td><td align="char" char="." valign="top">0.71</td></tr><tr><td align="left" valign="top">Q9</td><td align="char" char="." valign="top">14.80</td><td align="char" char="." valign="top">10.90</td><td align="char" char="." valign="top">&#x2212;3.90</td><td align="char" char="." valign="top">1.12</td><td align="char" char="." valign="top">10.37</td><td align="char" char="." valign="top">&#x2212;4.43</td><td align="char" char="." valign="top">0.66</td></tr><tr><td align="left" valign="top">Q10</td><td align="char" char="." valign="top">10.70</td><td align="char" char="." valign="top">9.83</td><td align="char" char="." valign="top">&#x2212;0.87</td><td align="char" char="." valign="top">1.70</td><td align="char" char="." valign="top">9.97</td><td align="char" char="." valign="top">&#x2212;0.73</td><td align="char" char="." 
valign="top">0.66</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Statistics across all the responses for 10 questions are as follows: mean 10.53 (SD 2.17) and median 10.30 (IQR 1.63).</p></fn><fn id="table2fn2"><p><sup>b</sup>Statistics across all the responses for 10 questions are as follows: mean 10.36 (SD 0.84) and median 10.66 (IQR 1.01).</p></fn><fn id="table2fn3"><p><sup>c</sup>Statistics across all the responses for 10 questions are as follows: mean 10.27 (SD 0.98) and median 10.17 (IQR: 1.41).</p></fn><fn id="table2fn4"><p><sup>d</sup>Difference in comparison to reference. Positive difference indicates that the FKGL score was higher than that of reference, whereas negative difference means that the FKGL scores were lower than that of reference.</p></fn></table-wrap-foot></table-wrap><p>In the single-instance analysis, ChatGPT&#x2019;s mean FKGL score (10.11, SD 1.17) was slightly lower than the expert reference (10.53, SD 2.17), whereas DeepSeek&#x2019;s mean score (10.53, SD 1.02) matched the expert reference answer exactly. In the multiresponse analysis, ChatGPT&#x2019;s mean FKGL increased to 10.36 (SD 0.84), whereas DeepSeek&#x2019;s mean FKGL slightly decreased to 10.27 (SD 0.98). ChatGPT demonstrated reduced variability in the multiresponse case (SD 0.84) compared to the single-instance case (SD 1.17), indicating greater consistency when averaging multiple responses. In contrast, DeepSeek&#x2019;s variability remained relatively stable across both analyses (SD 1.02 in single instance and 0.98 in multiresponse).</p><p>In both analyses, DeepSeek&#x2019;s median FKGL scores were slightly lower than ChatGPT&#x2019;s, suggesting that DeepSeek&#x2019;s responses may be more concise or readable. DeepSeek also exhibited less fluctuation in readability scores, as indicated by its smaller IQR 0.77 (median 10.45) in single instance and 1.41 (median 10.17) in multiresponse. 
In contrast, ChatGPT&#x2019;s variability decreased in the multiresponse case, with its IQR narrowing from 2.05 (median 10.55) to 1.01 (median 10.66).</p><p>A detailed evaluation of FKGL readability scores reveals variations in how ChatGPT and DeepSeek generate responses compared to expert references. For some questions (eg, Q1, Q2, Q3, and Q4), both models produced more complex responses (higher FKGL scores), potentially reducing accessibility for general readers but could be more suitable for professionals and specialists seeking in-depth information.</p><p>In contrast, for other questions (eg, Q5, Q6, Q7, and Q9), DeepSeek generally remained closer to the reference readability (reference) but showed greater variability, whereas ChatGPT exhibited larger deviations, sometimes producing more complex or simpler responses (<xref ref-type="fig" rid="figure3">Figure 3A, B, D and E</xref>). The multiresponse analysis improved consistency (<xref ref-type="fig" rid="figure3">Figure 3F</xref>), smoothing out extreme deviations (eg, outliers evident in <xref ref-type="fig" rid="figure3">Figure 3C</xref>) and leading to more balanced readability. Full per-question FKGL values and paired comparisons for single-instance and multiresponse data are provided in Tables S8-S11 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendices 8</xref><xref ref-type="supplementary-material" rid="app9"/><xref ref-type="supplementary-material" rid="app10"/>-<xref ref-type="supplementary-material" rid="app11">11</xref>.</p><p>A paired <italic>t</italic> test was conducted to assess the statistical significance of mean differences in FKGL scores among the expert reference answer, ChatGPT-4.0, and DeepSeek-V3 groups (<xref ref-type="table" rid="table3">Table 3</xref>). The Shapiro-Wilk test confirmed normality (<italic>P</italic>&#x003E;.05 for all groups), validating the use of the <italic>t</italic> test. 
Results indicated no significant differences in readability scores between the expert reference answers and those generated by ChatGPT-4.0 or DeepSeek-V3, nor between ChatGPT-4.0 and DeepSeek-V3 (<italic>P</italic>=.61 for single instance and <italic>P</italic>=.73 for multiresponse). However, high variability in scores across questions was observed. Multiresponse analyses showed smaller differences between AI models and the reference, suggesting that averaging multiple responses yields more balanced readability. Normality was further confirmed by Shapiro-Wilk <italic>P</italic> values of .16 (ChatGPT-4.0) and .82 (DeepSeek-V3). The corresponding statistical summaries are provided in Tables S8 and S10 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendices 8</xref> and <xref ref-type="supplementary-material" rid="app10">10</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Statistical comparison of readability (FKGL scores) among expert references, ChatGPT-4.0, and DeepSeek-V3 using paired <italic>t</italic> tests.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Comparison</td><td align="left" valign="bottom" colspan="4">Single-instance</td><td align="left" valign="bottom" colspan="4">Multiresponse</td></tr><tr><td align="left" valign="bottom"><italic>t</italic> test <italic>(df)</italic><sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="2">95% CI<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom"><italic>t</italic> test <italic>(df)</italic><sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" 
rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="2">95% CI<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT-4.0 versus reference</td><td align="left" valign="top">&#x2212;0.45 (9)</td><td align="left" valign="top">.66</td><td align="left" valign="top" colspan="2">&#x2212;1.69 to 2.53</td><td align="left" valign="top">&#x2212;0.24 (9)</td><td align="left" valign="top">.82</td><td align="left" valign="top" colspan="2">&#x2212;1.40 to 1.74</td></tr><tr><td align="left" valign="top">DeepSeek-V3 versus reference</td><td align="left" valign="top">&#x2212;0.24 (9)</td><td align="left" valign="top">.81</td><td align="left" valign="top" colspan="2">&#x2212;1.58 to 1.96</td><td align="left" valign="top">&#x2212;0.40 (9)</td><td align="left" valign="top">.70</td><td align="left" valign="top" colspan="2">&#x2212;1.25 to 1.78</td></tr><tr><td align="left" valign="top">ChatGPT-4.0 versus DeepSeek-V3</td><td align="left" valign="top">0.52 (9)</td><td align="left" valign="top">.61</td><td align="left" valign="top" colspan="2">&#x2212;1.23 to 0.77</td><td align="left" valign="top">0.35 (9)</td><td align="left" valign="top">.73</td><td align="left" valign="top" colspan="2">&#x2212;0.53 to 0.72</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup><italic>t</italic> test <italic>(df)</italic>: measures the magnitude and direction of the difference between paired samples. A negative <italic>t</italic> value indicates that ChatGPT had lower scores than DeepSeek, while a positive <italic>t</italic> value indicates the opposite. <italic>t</italic> statistics are reported, all tests were 2-tailed paired <italic>t</italic> tests.  Degrees of freedom shown in parentheses, and were calculated as n&#x2212;1 for paired comparisons (<italic>df</italic>=9 for all single analyses). 
</p></fn><fn id="table3fn2"><p><sup>b</sup><italic>P</italic> value represents the probability of observing the test results under the null hypothesis. A <italic>P</italic> value &#x2264;.05 is considered statistically significant. </p></fn><fn id="table3fn3"><p><sup>c</sup>95% CI for the mean difference.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Content Quality Assessment</title><p>Response quality was evaluated using a 7-point Likert scale across five criteria: accuracy, clarity and readability, completeness, depth and insight, and alignment with reference answers. In the single-response analysis, ChatGPT achieved an overall mean Likert score of 5.36 (SD 0.53), whereas DeepSeek had a slightly higher mean score of 5.60 (SD 0.45) across all criteria and questions. At the criterion level, ChatGPT scored higher in alignment and accuracy, whereas DeepSeek performed slightly better in the remaining criteria (<xref ref-type="fig" rid="figure4">Figure 4A&#x2013;C</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Content quality ratings for ChatGPT-4.0 and DeepSeek-V3 across 5 criteria. (A) ChatGPT response evaluation based on the average scores of 10 answers. (B) DeepSeek response evaluation based on average score of 10 answers. (C) ChatGPT versus DeepSeek comparison based on average scores of 10 answers. (D) ChatGPT response evaluation based on 60 answers. (E) DeepSeek response evaluation based on 60 answers. (F) ChatGPT versus DeepSeek comparison based on average scores of 10 answers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e72839_fig04.png"/></fig><p>In the multiresponse analysis (dataset 2), ChatGPT achieved a mean Likert score of 6.01 (SD 0.49), while DeepSeek achieved a comparable mean score of 6.22 (SD 0.43) across all criteria and responses (<xref ref-type="fig" rid="figure4">Figure 4D&#x2013;F</xref>). 
Differences between the two models were minimal and not statistically significant (<italic>P</italic>&#x003E;.05), with overlapping 95% CIs for completeness, clarity and readability, depth and insight, and alignment with the reference answer. However, DeepSeek outperformed ChatGPT based on accuracy (ChatGPT: 5.95 vs DeepSeek: 6.25). This difference was statistically significant (<italic>t</italic><sub>9</sub>=&#x2212;2.377; <italic>P</italic>=.041; 95% CI &#x2212;0.585 to &#x2212;0.015), indicating consistently higher accuracy scores for DeepSeek across the 10 questions. Overall, these findings indicate that both models provide comparably high-quality responses across the assessed dimensions. Detailed summary statistics are provided in <xref ref-type="table" rid="table4">Table 4</xref>, and full per-question and per-criterion Likert ratings for both the single-instance and multiresponse datasets are provided in Table S12 in <xref ref-type="supplementary-material" rid="app12">Multimedia Appendix 12</xref> and Table S13 in <xref ref-type="supplementary-material" rid="app13">Multimedia Appendix 13</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of ChatGPT-4.0 and DeepSeek-V3 content quality across 5 evaluation criteria using paired <italic>t</italic> tests for single- and multiresponse analyses.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Evaluation criterion</td><td align="left" valign="top">ChatGPT-4.0<break/>score (mean)</td><td align="left" valign="top">DeepSeek-V3<break/>score (mean)</td><td align="left" valign="top"><italic>t</italic> test (<italic>df</italic>)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." 
valign="top" colspan="2">95% CI<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Single-instance results</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top" colspan="2"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="char" char="." valign="top">6.20</td><td align="char" char="." valign="top">6.30</td><td align="char" char="." valign="top">&#x2212;0.43 (9)</td><td align="char" char="." valign="top">.68</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.63 to 0.43</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Completeness</td><td align="char" char="." valign="top">5.80</td><td align="char" char="." valign="top">6.20</td><td align="char" char="." valign="top">&#x2212;1.50 (9)</td><td align="char" char="." valign="top">.17</td><td align="char" char="." valign="top" colspan="2">&#x2212;1.00 to 0.20</td></tr><tr><td align="left" valign="top">Clarity and readability</td><td align="char" char="." valign="top">6.40</td><td align="char" char="." valign="top">6.40</td><td align="char" char="." valign="top">0.00 (9)</td><td align="char" char="." valign="top">1.00</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.34 to 0.34</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Depth and insight</td><td align="char" char="." valign="top">6.60</td><td align="char" char="." valign="top">6.60</td><td align="char" char="." valign="top">0.00 (9)</td><td align="char" char="." valign="top">1.00</td><td align="char" char="." 
valign="top" colspan="2">&#x2212;0.34 to 0.34</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Alignment with reference answer</td><td align="char" char="." valign="top">5.80</td><td align="char" char="." valign="top">5.90</td><td align="char" char="." valign="top">&#x2212;0.43 (9)</td><td align="char" char="." valign="top">.68</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.63 to 0.43</td></tr><tr><td align="left" valign="top" colspan="7">Multiresponse results</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="char" char="." valign="top">5.95</td><td align="char" char="." valign="top">6.25</td><td align="char" char="." valign="top">&#x2212;2.38 (9)</td><td align="char" char="." valign="top">.04</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.59 to &#x2212;0.01</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Completeness</td><td align="char" char="." valign="top">5.85</td><td align="char" char="." valign="top">6.05</td><td align="char" char="." valign="top">&#x2212;1.91 (9)</td><td align="char" char="." valign="top">.09</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.44 to 0.04</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clarity and readability</td><td align="char" char="." valign="top">6.27</td><td align="char" char="." valign="top">6.43</td><td align="char" char="." valign="top">&#x2212;1.73 (9)</td><td align="char" char="." valign="top">.12</td><td align="char" char="." 
valign="top" colspan="2">&#x2212;0.38 to 0.05</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Depth and insight</td><td align="char" char="." valign="top">6.48</td><td align="char" char="." valign="top">6.65</td><td align="char" char="." valign="top">&#x2212;1.79 (9)</td><td align="char" char="." valign="top">.11</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.38 to 0.04</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Alignment with reference answer</td><td align="char" char="." valign="top">5.52</td><td align="char" char="." valign="top">5.70</td><td align="char" char="." valign="top">&#x2212;1.49 (9)</td><td align="char" char="." valign="top">.17</td><td align="char" char="." valign="top" colspan="2">&#x2212;0.46 to 0.09</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup><italic>t</italic> test: measures the magnitude and direction of the difference between paired samples. A negative <italic>t</italic> value indicates that ChatGPT had lower scores than DeepSeek, whereas a positive <italic>t</italic> value indicates the opposite, all tests were 2-tailed paired <italic>t</italic> tests Degrees of freedom shown in parentheses, and were calculated as n&#x2212;1 for paired comparisons (<italic>df</italic>=9 for all analyses).</p></fn><fn id="table4fn2"><p><sup>b</sup><italic>P</italic> value represents the probability of observing the test results under the null hypothesis. 
A <italic>P</italic> value  &#x2264;.05 is considered statistically significant.</p></fn><fn id="table4fn3"><p><sup>c</sup>95% CI of the mean difference.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Information Reliability Assessment</title><p>We analyzed the reliability of citations provided by ChatGPT-4.0 and DeepSeek-V3 in response to 10 clinically oriented breast cancer questions. For ChatGPT, 45 distinct references were extracted and scored by all 3 experts. Human raters showed very high internal consistency: Fleiss &#x03BA; was 0.842, indicating almost perfect agreement. Of these 45 references, 40 (88.9%) received perfect agreement from all 3 raters, and 44 (97.8%) reached majority agreement (ie, at least two raters agreed on the same label). Only 1/45 (2.2%) reference showed complete disagreement among the raters. Detailed results are provided in Table S14 in <xref ref-type="supplementary-material" rid="app14">Multimedia Appendix 14</xref>.</p><p>For DeepSeek, we evaluated 268 unique references. Interrater reliability among the three experts was even higher: Fleiss &#x03BA; was 0.935, again in the <italic>almost perfect</italic> range. Across these 268 references, 260 (97.0%) had perfect agreement, and 267 (99.6%) had majority agreement. Only 1/268 references (0.4%) showed complete disagreement. These results confirm that, despite differences in style and depth of citation between the models, human experts were able to judge source reliability in a highly consistent way across both datasets. 
All details are provided in Table S15 in <xref ref-type="supplementary-material" rid="app15">Multimedia Appendix 15</xref>.</p><p>We then assessed how well each AI model&#x2019;s citation-level reliability scores aligned with human judgment (ie, each reference set was evaluated by both ChatGPT and DeepSeek as reported in Table S14 and Table S15 in <xref ref-type="supplementary-material" rid="app14">Multimedia Appendices 14</xref> and <xref ref-type="supplementary-material" rid="app15">15</xref>, respectively). For the ChatGPT reference set, agreement between ChatGPT&#x2019;s scores and the human majority score was substantial (Cohen &#x03BA;=0.665). For the DeepSeek reference set, agreement between DeepSeek&#x2019;s scores and the human majority score was higher (&#x03BA;=0.782). When disagreement occurred, ChatGPT differed from the human majority on 7 of its 45 references, whereas DeepSeek differed on 32 of its 268 references. These agreement metrics and disagreement profiles are summarized in Tables S14 and S15 in <xref ref-type="supplementary-material" rid="app14">Multimedia Appendices 14</xref> and <xref ref-type="supplementary-material" rid="app15">15</xref>, respectively.</p><p>To enable direct comparison, we also evaluated each model on the other model&#x2019;s reference set. In the ChatGPT set, DeepSeek&#x2019;s scoring of those same sources achieved &#x03BA;=0.800, which exceeded ChatGPT&#x2019;s own &#x03BA;=0.665. In the DeepSeek set, ChatGPT&#x2019;s scoring of DeepSeek&#x2019;s citations achieved &#x03BA;=0.600, which was lower than DeepSeek&#x2019;s &#x03BA;=0.782. In other words, when both models were evaluated against the same expert consensus, DeepSeek was consistently closer to the human majority than ChatGPT.</p><p>Qualitatively, ChatGPT tended to cite large US medical organizations and public-facing health information portals (eg, national cancer societies, specialty associations, and patient education sites). 
These sources were commonly accepted as reliable when they represented established clinical authorities, but they occasionally triggered disagreement when they originated from advocacy foundations or commercial health media. DeepSeek more often cited recent peer-reviewed literature, systematic reviews, and oncology-specific clinical studies, which generally scored as reliable, although some entries contained incomplete metadata (eg, partial author or year strings or nonresolving links) that required manual adjudication.</p><p>Taken together, these results show 3 things. First, human expert scoring of reference quality was internally very stable for both models (Fleiss &#x03BA;=0.842 for ChatGPT&#x2019;s reference set and 0.935 for DeepSeek&#x2019;s reference set) with majority agreement observed for 44/45 (97.8%) and 267/268 (99.6%) references, respectively, indicating that <italic>reliable versus unreliable</italic> source is a reproducible judgment in this context. Second, both AI systems demonstrated substantial alignment with expert consensus, but DeepSeek exhibited stronger agreement with the human majority than ChatGPT (&#x03BA; up to 0.800 on the ChatGPT set and 0.782 on its own set, versus ChatGPT&#x2019;s &#x03BA; of 0.665 and 0.600<italic>,</italic> respectively). 
Third, the main source of residual disagreement was concentrated in borderline cases, such as advocacy organization content, institutional news posts, and partially specified citations, rather than in conventional peer-reviewed oncology literature (<xref ref-type="table" rid="table5">Table 5</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Summary of citation reliability and agreement metrics for ChatGPT-4.0 and DeepSeek-V3.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">ChatGPT-4.0</td><td align="left" valign="bottom">DeepSeek-V3</td></tr></thead><tbody><tr><td align="left" valign="top">References evaluated, N</td><td align="left" valign="top">45</td><td align="left" valign="top">268</td></tr><tr><td align="left" valign="top">Reliability score (majority human; mean)</td><td align="left" valign="top">0.64</td><td align="left" valign="top">0.82</td></tr><tr><td align="left" valign="top">Fleiss &#x03BA; (inter-rater agreement)</td><td align="left" valign="top">0.842</td><td align="left" valign="top">0.935</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perfect agreement (all 3 raters assigned the same label), n/N (%)</td><td align="left" valign="top">40/45 (88.9)</td><td align="left" valign="top">260/268 (97.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority agreement (&#x2265;2 of 3 raters assigned the same label), n/N (%)</td><td align="left" valign="top">44/45 (97.8)</td><td align="left" valign="top">267/268 (99.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complete disagreement (all 3 raters assigned different labels), n/N (%)</td><td align="left" valign="top">1/45 (2.2)</td><td 
align="left" valign="top">1/268 (0.4)</td></tr><tr><td align="left" valign="top">Cohen &#x03BA; (AI vs human majority)</td><td align="left" valign="top">0.665</td><td align="left" valign="top">0.782</td></tr><tr><td align="left" valign="top">Cross-evaluation &#x03BA; (AI scoring the other model&#x2019;s references)</td><td align="left" valign="top">ChatGPT on DeepSeek=0.600</td><td align="left" valign="top">DeepSeek on ChatGPT=0.800</td></tr></tbody></table></table-wrap><p>The mean reliability score (majority human) is the mean binary reliability score (1=reliable and 0=unreliable) assigned by the human majority (&#x2265;2 of 3 expert raters) across all unique references cited by each model. &#x201C;Fleiss &#x03BA;&#x201D; quantifies agreement among the 3 human experts for each model&#x2019;s reference set. &#x201C;Perfect agreement&#x201D; is the proportion of references where all 3 experts independently gave the same reliability score. &#x201C;Majority agreement&#x201D; is the proportion of references where at least two of the 3 experts agreed. &#x201C;Complete disagreement&#x201D; refers to cases in which all 3 experts gave different scores. &#x201C;Cohen &#x03BA; (AI vs human majority score)&#x201D; measures agreement between each model&#x2019;s own reliability classification of its citations and the human majority classification for the same reference set. &#x201C;Cross-evaluation &#x03BA;&#x201D; measures how well each model&#x2019;s scoring of the other model&#x2019;s references agreed with the human majority for that reference set (eg, &#x201C;DeepSeek on ChatGPT&#x201D;=DeepSeek&#x2019;s &#x03BA; when scoring the ChatGPT reference set). 
All underlying data are provided in Table S14 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> (ChatGPT-4.0) and Table S15 in <xref ref-type="supplementary-material" rid="app14">Multimedia Appendices 14</xref> and <xref ref-type="supplementary-material" rid="app15">15</xref> (DeepSeek-V3).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study systematically evaluated the performance of ChatGPT-4.0 and DeepSeek-V3 in generating breast cancer information, focusing on readability, content quality, and reference quality. This multidimensional assessment approach (<xref ref-type="fig" rid="figure1">Figure 1</xref>), which is rarely addressed collectively in existing literature, offers valuable insights into the capabilities and limitations of generative AI models in health care. Both models generated clinically relevant and readable responses, with ChatGPT-4.0 showing greater consistency and overall readability, whereas DeepSeek-V3 offered more diverse citations but with greater variability and occasional technical issues.</p></sec><sec id="s4-2"><title>Readability Analysis</title><p>The FKGL readability scores revealed that AI-generated responses generally used simpler language structures than expert reference answers, which may enhance accessibility for general audiences. Expert responses had the highest FKGL scores (mean 10.53, SD 2.17), reflecting more complex vocabulary and sentence construction. In contrast, both ChatGPT and DeepSeek produced responses with lower FKGL scores, indicating simpler, more readable content.</p><p>This trend aligns with prior studies showing that LLMs often generate linguistically simplified outputs, which may benefit general users [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. 
Notably, ChatGPT-4.0 exhibited more consistent readability across multiple outputs, with a median FKGL of 10.66 (IQR 1.01), compared to DeepSeek-V3, which showed greater variability with a median FKGL of 10.17 (IQR 1.41), suggesting that ChatGPT-4.0 achieved greater stability with repeated querying.</p><p>For instance, ChatGPT-4.0 demonstrated improved readability consistency when averaging multiple responses, suggesting stability with repeated querying. In contrast, DeepSeek-V3&#x2019;s responses were closer to expert reference readability levels in single-instance analysis but displayed greater variability in multiresponse cases (IQR 1.41).</p><p>However, statistical comparisons did not reveal significant differences in readability between models or relative to expert references. For instance, in the multiresponse setting, the paired <italic>t</italic> test for ChatGPT-4.0 versus reference yielded <italic>t<sub>9</sub></italic>=&#x2212;0.24; <italic>P</italic>=.82 (95% CI &#x2212;1.40 to 1.74), and for DeepSeek-V3 versus reference, <italic>t<sub>9</sub></italic>=&#x2212;0.40; <italic>P</italic>=.70 (95% CI &#x2212;1.25 to 1.78). The difference between ChatGPT and DeepSeek was also nonsignificant (<italic>t</italic><sub>9</sub>=0.35; <italic>P</italic>=.73). Similar nonsignificant findings were observed in the single-instance analysis.</p><p>It is essential to emphasize that FKGL measures linguistic complexity, not clinical accuracy or information richness [<xref ref-type="bibr" rid="ref35">35</xref>]. Lower FKGL scores do not necessarily indicate that critical medical details were omitted. 
Therefore, while improved readability may facilitate comprehension, further analysis is necessary to confirm that the content remains accurate, complete, and clinically appropriate.</p></sec><sec id="s4-3"><title>Content Quality Analysis</title><p>Beyond readability, content quality was assessed across 5 predefined evaluation criteria, including accuracy, completeness, clarity, depth and insight, and alignment with expert reference answers. This structured evaluation, based on established frameworks [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], allowed a more nuanced assessment of response quality. Across analyses, both models achieved high scores across all criteria. In the multiresponse dataset, DeepSeek achieved a higher overall mean score (6.22, SD 0.43) than ChatGPT (6.01, SD 0.49), with DeepSeek scoring numerically higher across all 5 criteria. Notably, DeepSeek demonstrated a statistically significant advantage in accuracy in the multiresponse analysis (<italic>t</italic><sub>9</sub>=&#x2212;2.377; <italic>P</italic>=.041; 95% CI &#x2212;0.585 to &#x2212;0.015), while differences for completeness, clarity and readability, depth and insight, and alignment with the reference answer did not reach statistical significance (<italic>P</italic>&#x003E;.05). Interestingly, averaging multiple responses per question reduced response-level variability and revealed a more consistent performance advantage for DeepSeek, particularly for accuracy.</p><p>These findings expand on earlier studies that primarily focused on ChatGPT-3.5 [<xref ref-type="bibr" rid="ref33">33</xref>] by demonstrating the evolving strengths and remaining limitations of newer generative AI models when applied to clinically relevant educational content. 
Unlike prior studies limited to single-response analysis, our multiresponse approach highlights how response variability remains an important factor when evaluating AI for health care education.</p></sec><sec id="s4-4"><title>Information Reliability Assessment</title><p>The reference reliability analysis revealed clear differences in both reference selection patterns and interrater scoring agreement for ChatGPT-4.0 and DeepSeek-V3 citations. ChatGPT tended to cite US-based medical institutions, national health organizations, and patient-facing educational resources, whereas DeepSeek more often cited recent peer-reviewed oncology studies and systematic reviews, although some DeepSeek citations were partially specified or contained inaccessible links. Human experts showed very consistent judgments of citation reliability for both models&#x2019; outputs: interrater reliability was almost perfect for both sets (Fleiss &#x03BA;=0.842 for ChatGPT&#x2019;s 45 references and 0.935 for DeepSeek&#x2019;s 268 references), with &#x2265;97% majority agreement and only one fully disputed reference in each set. When we compared each model&#x2019;s scoring of source reliability to the majority expert score, agreement was substantial for ChatGPT and higher for DeepSeek. DeepSeek reached Cohen &#x03BA; values up to 0.800 (for the ChatGPT reference set) and 0.782 (for the DeepSeek reference set), indicating strong alignment with expert consensus, whereas ChatGPT&#x2019;s ability to assess references reached 0.665 for its own reference set and 0.600 when evaluating DeepSeek&#x2019;s set.</p><p>Collectively, these results demonstrate that both models operate within the bounds of medically acceptable sourcing but use distinct strategies. DeepSeek&#x2019;s output, favoring primary literature, more accurately reflects the evidence-based curation of oncology experts. 
Conversely, ChatGPT&#x2019;s propensity for synthesized institutional guidance results in a higher frequency of citations to borderline sources that require expert judgment.</p></sec><sec id="s4-5"><title>Implications for Practice and Future Research</title><p>Collectively, these findings suggest that while generative AI models offer promising capabilities for generating patient education materials, important limitations remain. Oversimplification of complex medical information, variability in response consistency, and the reliability of cited sources remain persistent concerns. Given the growing interest in deploying AI tools for patient counseling, clinical decision support, and health care communication, these results underscore the need for careful validation of AI-generated content before clinical use [<xref ref-type="bibr" rid="ref48">48</xref>]. Furthermore, while models such as DeepSeek demonstrate the potential for integrating primary literature more effectively, consistent quality control of both content and source reliability will be essential for safe application in clinical settings.</p><p>Finally, the structured multidimensional evaluation framework used in this study provides a replicable approach for future evaluations of AI models in health care. Future studies should continue to assess performance across broader clinical domains, expand evaluation across different languages and health literacy levels, and integrate clinical expert validation to further ensure the safety, accuracy, and equity of AI-generated medical information.</p></sec><sec id="s4-6"><title>Limitations and Future Directions</title><p>This study has several limitations. The FKGL test assessed structural readability but not contextual or clinical accuracy, and it is validated only for English, limiting generalizability across languages and literacy levels [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. 
Additionally, the study was conducted in a controlled setting without patient or provider involvement, so real-world usability, comprehension, and clinical outcomes remain unexplored. Future work should include usability testing and assess impacts on decision-making and adherence [<xref ref-type="bibr" rid="ref49">49</xref>]. Moreover, AI responses were not compared directly to clinical guidelines, and future studies should include validation against established standards before clinical integration. Reference accuracy remains a concern, as AI models may generate outdated or fabricated citations, requiring expert oversight [<xref ref-type="bibr" rid="ref50">50</xref>].</p><p>Furthermore, the risk of bias in training data may lead to inaccuracies for underrepresented populations, highlighting the importance of ongoing monitoring and alignment with validated medical sources [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Evidence from clinical research, including variability in therapeutic outcomes across patient subgroups and molecular subtypes, further emphasizes the necessity for such validation [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. Finally, ethical and regulatory considerations are essential for safe deployment of AI in health care; compliance with frameworks from regulatory bodies such as the Food and Drug Administration and the European Medicines Agency will be critical to ensure safety, transparency, and public trust [<xref ref-type="bibr" rid="ref53">53</xref>]. 
As these technologies continue to evolve, continuous evaluation, comparative benchmarking, and expert oversight will be necessary to guide their responsible implementation.</p></sec><sec id="s4-7"><title>Conclusions</title><p>This comparative analysis of ChatGPT-4.0 and DeepSeek-V3 demonstrated that both AI platforms effectively retrieve and present medical information on breast cancer, each excelling in different areas. ChatGPT produced more polished, detailed, and readable responses, with strong performance in clarity and completeness, whereas DeepSeek outperformed ChatGPT on accuracy, and provided more comprehensive, globally diverse references with in-text citations resembling academic articles. However, DeepSeek faced challenges such as untagged links, occasional downtime, and corrupted references, which impacted the user experience. Despite these limitations, DeepSeek demonstrated superior citation efficiency and closely aligned with expert consensus answers. Moreover, interrater reliability analyses confirmed high levels of agreement among human evaluators across both models, with Fleiss &#x03BA; scores exceeding 0.84 and Cohen &#x03BA; values indicating substantial to almost perfect agreement. These findings support the overall quality of reference sources provided by the models and lend credibility to the evaluation framework used in this study. Furthermore, statistical analysis revealed no significant differences between the models across most evaluation criteria; however, in the multiresponse dataset (i.e., larger datasets), DeepSeek demonstrated a statistically significant advantage in accuracy, underscoring the importance of rigorous evaluation of AI-generated medical information to ensure accuracy, accessibility, and reliability. 
To optimize their effectiveness in health care applications, future improvements should focus on enhancing platform stability, response consistency, and overall user accessibility.</p></sec></sec></body><back><ack><p>All authors declared that they had insufficient funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee (APF) support for the publication of this article.</p></ack><notes><sec><title>Funding</title><p>RH and DAS acknowledge funding provided by the Deanship of Scientific Research at Al-Zaytoonah University of Jordan (grant 2023-2022/17/50 and 2025-2024/06/29).</p></sec><sec><title>Data Availability</title><p>All data generated or analyzed during this study, including AI-generated responses, reference metadata, and interrater reliability analyses, are publicly available in a GitHub repository [<xref ref-type="bibr" rid="ref54">54</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: RH, SKB</p><p>Data Curation: RH, DAS, SKB</p><p>Formal analysis: RH</p><p>Methodology: RH, DAS, SKB</p><p>Writing &#x2013; original draft: RH, DAS, SKB</p><p>Writing &#x2013; review &#x0026; editing: RH, DAS, SKB</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations:</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">FAQ</term><def><p>frequently asked question</p></def></def-item><def-item><term id="abb3">FKGL</term><def><p>Flesch-Kincaid Grade Level</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ferlay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>RL</given-names> </name><etal/></person-group><article-title>Global cancer statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA Cancer J Clin</source><year>2021</year><month>05</month><volume>71</volume><issue>3</issue><fpage>209</fpage><lpage>249</lpage><pub-id pub-id-type="doi">10.3322/caac.21660</pub-id><pub-id pub-id-type="medline">33538338</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sinha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Naskar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rautaray</surname><given-names>SS</given-names> </name></person-group><article-title>Challenges to the early diagnosis of breast cancer: current scenario and the challenges ahead</article-title><source>SN COMPUT SCI</source><year>2024</year><volume>5</volume><issue>1</issue><fpage>170</fpage><pub-id pub-id-type="doi">10.1007/s42979-023-02534-1</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sunoqrot</surname><given-names>S</given-names> </name><name name-style="western"><surname>Abusulieh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Abusara</surname><given-names>OH</given-names> </name></person-group><article-title>Identifying synergistic combinations of doxorubicin-loaded polyquercetin nanoparticles and natural products: 
implications for breast cancer therapy</article-title><source>Int J Pharm</source><year>2023</year><month>10</month><day>15</day><volume>645</volume><fpage>123392</fpage><pub-id pub-id-type="doi">10.1016/j.ijpharm.2023.123392</pub-id><pub-id pub-id-type="medline">37683979</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sweidan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Elfadel</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sabbah</surname><given-names>DA</given-names> </name><etal/></person-group><article-title>Novel derivatives of 4,6&#x2010;dihydroxy&#x2010;2&#x2010;quinolone&#x2010;3&#x2010;carboxamides as potential PI3K&#x03B1; inhibitors</article-title><source>ChemistrySelect</source><year>2022</year><month>08</month><day>26</day><volume>7</volume><issue>32</issue><comment><ext-link ext-link-type="uri" xlink:href="https://chemistry-europe.onlinelibrary.wiley.com/toc/23656549/7/32">https://chemistry-europe.onlinelibrary.wiley.com/toc/23656549/7/32</ext-link></comment><pub-id pub-id-type="doi">10.1002/slct.202202263</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="book"><source>Global Breast Cancer Initiative Implementation Framework: Assessing, Strengthening and Scaling-up of Services for the Early Detection and Management of Breast Cancer</source><year>2023</year><access-date>2026-01-14</access-date><publisher-name>World Health Organization</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/publications/i/item/9789240067134">https://www.who.int/publications/i/item/9789240067134</ext-link></comment><pub-id pub-id-type="other">978-92-4-006713-4</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kasper</surname><given-names>G</given-names> </name><name name-style="western"><surname>Momen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sorice</surname><given-names>KA</given-names> </name><etal/></person-group><article-title>Effect of neighborhood and individual-level socioeconomic factors on breast cancer screening adherence in a multiethnic study</article-title><source>BMC Public Health</source><year>2024</year><month>01</month><day>2</day><volume>24</volume><issue>1</issue><fpage>63</fpage><pub-id pub-id-type="doi">10.1186/s12889-023-17252-9</pub-id><pub-id pub-id-type="medline">38166942</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>C</given-names> </name></person-group><article-title>Online health information seeking behavior among breast cancer patients and survivors: a scoping review</article-title><source>BMC Womens Health</source><year>2025</year><month>01</month><day>3</day><volume>25</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1186/s12905-024-03509-x</pub-id><pub-id pub-id-type="medline">39754199</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Loeb</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Langford</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Bragg</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Sherman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>JM</given-names> </name></person-group><article-title>Cancer misinformation on social media</article-title><source>CA Cancer J Clin</source><year>2024</year><volume>74</volume><issue>5</issue><fpage>453</fpage><lpage>464</lpage><pub-id pub-id-type="doi">10.3322/caac.21857</pub-id><pub-id pub-id-type="medline">38896503</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Parsons</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dorff</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Cancer misinformation and harmful information on Facebook and other social media: a brief report</article-title><source>J Natl Cancer Inst</source><year>2022</year><month>07</month><day>11</day><volume>114</volume><issue>7</issue><fpage>1036</fpage><lpage>1039</lpage><pub-id pub-id-type="doi">10.1093/jnci/djab141</pub-id><pub-id pub-id-type="medline">34291289</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walker</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Ghani</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kuemmerli</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Reliability of medical information provided by ChatGPT: assessment against clinical guidelines and 
patient information quality instrument</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>30</day><volume>25</volume><fpage>e47479</fpage><pub-id pub-id-type="doi">10.2196/47479</pub-id><pub-id pub-id-type="medline">37389908</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dave</surname><given-names>T</given-names> </name><name name-style="western"><surname>Athaluri</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>ChatGPT in medicine: an overview of its applications, advantages, limitations, future prospects, and ethical considerations</article-title><source>Front Artif Intell</source><year>2023</year><volume>6</volume><fpage>1169595</fpage><pub-id pub-id-type="doi">10.3389/frai.2023.1169595</pub-id><pub-id pub-id-type="medline">37215063</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YS</given-names> </name><etal/></person-group><article-title>Availability of ChatGPT to provide medical information for patients with kidney cancer</article-title><source>Sci Rep</source><year>2024</year><volume>14</volume><issue>1</issue><fpage>1542</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-51531-8</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Malin</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Rousseau</surname><given-names>JF</given-names> </name><etal/></person-group><article-title>From GPT to DeepSeek: significant gaps remain in realizing AI in health care</article-title><source>J Biomed Inform</source><year>2025</year><month>03</month><volume>163</volume><fpage>104791</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2025.104791</pub-id><pub-id pub-id-type="medline">39938624</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kincaid</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Braby</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mears</surname><given-names>JE</given-names> </name></person-group><article-title>Electronic authoring and delivery of technical information</article-title><source>J Instr Dev</source><year>1988</year><month>06</month><volume>11</volume><issue>2</issue><fpage>8</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1007/BF02904998</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sullivan</surname><given-names>GM</given-names> </name><name name-style="western"><surname>Artino</surname><given-names>AR</given-names>  <suffix>Jr</suffix></name></person-group><article-title>Analyzing and interpreting data from likert-type scales</article-title><source>J Grad Med Educ</source><year>2013</year><month>12</month><volume>5</volume><issue>4</issue><fpage>541</fpage><lpage>542</lpage><pub-id pub-id-type="doi">10.4300/JGME-5-4-18</pub-id><pub-id pub-id-type="medline">24454995</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>AlZu&#x2019;bi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zreiqat</surname><given-names>A</given-names> </name><name name-style="western"><surname>Radi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Mughaid</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abualigah</surname><given-names>L</given-names> </name></person-group><article-title>An intelligent health care monitoring system-based novel deep learning approach for detecting covid-19 from x-rays images</article-title><source>Multimed Tools Appl</source><year>2024</year><volume>83</volume><issue>23</issue><fpage>63479</fpage><lpage>63496</lpage><pub-id pub-id-type="doi">10.1007/s11042-023-18056-0</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muhairat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Alzyadat</surname><given-names>W</given-names> </name><name name-style="western"><surname>Shaheen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alhroob</surname><given-names>A</given-names> </name><name name-style="western"><surname>Asfour</surname><given-names>AN</given-names> </name></person-group><article-title>Leveraging machine learning for predictive pathways in higher education: a case study at Al-Zaytoonah University of Jordan</article-title><source>SSRG-IJECE</source><year>2024</year><volume>11</volume><issue>11</issue><fpage>28</fpage><lpage>44</lpage><pub-id pub-id-type="doi">10.14445/23488549/IJECE-V11I11P104</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jarab</surname><given-names>AS</given-names> </name><name 
name-style="western"><surname>Al-Qerem</surname><given-names>W</given-names> </name><name name-style="western"><surname>Al-Hajjeh</surname><given-names>DM</given-names> </name><etal/></person-group><article-title>Artificial intelligence utilization in the health care setting: perceptions of the public in the UAE</article-title><source>Int J Environ Health Res</source><year>2025</year><month>03</month><volume>35</volume><issue>3</issue><fpage>585</fpage><lpage>593</lpage><pub-id pub-id-type="doi">10.1080/09603123.2024.2363472</pub-id><pub-id pub-id-type="medline">38832887</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buch</surname><given-names>VH</given-names> </name><name name-style="western"><surname>Ahmed</surname><given-names>I</given-names> </name><name name-style="western"><surname>Maruthappu</surname><given-names>M</given-names> </name></person-group><article-title>Artificial intelligence in medicine: current trends and future possibilities</article-title><source>Br J Gen Pract</source><year>2018</year><month>03</month><volume>68</volume><issue>668</issue><fpage>143</fpage><lpage>144</lpage><pub-id pub-id-type="doi">10.3399/bjgp18X695213</pub-id><pub-id pub-id-type="medline">29472224</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>AI in health and medicine</article-title><source>Nat 
Med</source><year>2022</year><month>01</month><volume>28</volume><issue>1</issue><fpage>31</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01614-0</pub-id><pub-id pub-id-type="medline">35058619</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dilsizian</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>EL</given-names> </name></person-group><article-title>Artificial intelligence in medicine and cardiac imaging: harnessing big data and advanced computing to provide personalized medical diagnosis and treatment</article-title><source>Curr Cardiol Rep</source><year>2014</year><month>01</month><volume>16</volume><issue>1</issue><fpage>441</fpage><pub-id pub-id-type="doi">10.1007/s11886-013-0441-8</pub-id><pub-id pub-id-type="medline">24338557</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malik</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pathania</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rathaur</surname><given-names>VK</given-names> </name></person-group><article-title>Overview of artificial intelligence in medicine</article-title><source>J Family Med Prim Care</source><year>2019</year><month>07</month><volume>8</volume><issue>7</issue><fpage>2328</fpage><lpage>2331</lpage><pub-id pub-id-type="doi">10.4103/jfmpc.jfmpc_440_19</pub-id><pub-id pub-id-type="medline">31463251</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sahu</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Gupta</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ambasta</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>P</given-names> </name></person-group><article-title>Artificial intelligence and machine learning in precision medicine: a paradigm shift in big data analysis</article-title><source>Prog Mol Biol Transl Sci</source><year>2022</year><volume>190</volume><issue>1</issue><fpage>57</fpage><lpage>100</lpage><pub-id pub-id-type="doi">10.1016/bs.pmbts.2022.03.002</pub-id><pub-id pub-id-type="medline">36008002</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hajjo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sabbah</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Al Bawab</surname><given-names>AQ</given-names> </name></person-group><article-title>Unlocking the potential of the human microbiome for identifying disease diagnostic biomarkers</article-title><source>Diagnostics (Basel)</source><year>2022</year><month>07</month><day>19</day><volume>12</volume><issue>7</issue><fpage>1742</fpage><pub-id pub-id-type="doi">10.3390/diagnostics12071742</pub-id><pub-id pub-id-type="medline">35885645</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boscardin</surname><given-names>CK</given-names> </name><name name-style="western"><surname>Gin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Golde</surname><given-names>PB</given-names> </name><name name-style="western"><surname>Hauer</surname><given-names>KE</given-names> </name></person-group><article-title>ChatGPT and generative artificial intelligence for 
medical education: potential impact and opportunity</article-title><source>Acad Med</source><year>2024</year><month>01</month><day>1</day><volume>99</volume><issue>1</issue><fpage>22</fpage><lpage>27</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005439</pub-id><pub-id pub-id-type="medline">37651677</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hajjo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sabbah</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Bardaweel</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Tropsha</surname><given-names>A</given-names> </name></person-group><article-title>Identification of tumor-specific MRI biomarkers using machine learning (ML)</article-title><source>Diagnostics (Basel)</source><year>2021</year><month>04</month><day>21</day><volume>11</volume><issue>5</issue><fpage>742</fpage><pub-id pub-id-type="doi">10.3390/diagnostics11050742</pub-id><pub-id pub-id-type="medline">33919342</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wenderott</surname><given-names>K</given-names> </name><name name-style="western"><surname>Krups</surname><given-names>J</given-names> </name><name name-style="western"><surname>Weigl</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wooldridge</surname><given-names>AR</given-names> </name></person-group><article-title>Facilitators and barriers to implementing AI in routine medical imaging: systematic review and qualitative analysis</article-title><source>J Med Internet Res</source><year>2025</year><month>07</month><day>21</day><volume>27</volume><fpage>e63649</fpage><pub-id 
pub-id-type="doi">10.2196/63649</pub-id><pub-id pub-id-type="medline">40690758</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>YM</given-names> </name></person-group><article-title>Key elements and theoretical foundations for the design and delivery of text messages to boost medication adherence in patients with diabetes, hypertension, and hyperlipidemia: scoping review</article-title><source>J Med Internet Res</source><year>2025</year><month>07</month><day>21</day><volume>27</volume><fpage>e71982</fpage><pub-id pub-id-type="doi">10.2196/71982</pub-id><pub-id pub-id-type="medline">40690759</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lai</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>TH</given-names> </name></person-group><article-title>Predicting pathological complete response following neoadjuvant therapy in patients with breast cancer: development of machine learning-based prediction models in a retrospective study</article-title><source>JMIR Cancer</source><year>2025</year><month>07</month><day>18</day><volume>11</volume><fpage>e64685</fpage><pub-id pub-id-type="doi">10.2196/64685</pub-id><pub-id pub-id-type="medline">40680158</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Xu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Efficacy of Fuzheng Quxie formula against postoperative metastasis of lung cancer in stage IIA-IIIA with negative driver genes: protocol for a multicenter, double-blind, randomized controlled trial</article-title><source>JMIR Res Protoc</source><year>2025</year><month>10</month><day>24</day><volume>14</volume><fpage>e66342</fpage><pub-id pub-id-type="doi">10.2196/66342</pub-id><pub-id pub-id-type="medline">41135941</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ye</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><etal/></person-group><article-title>An assessment of ChatGPT&#x2019;s responses to frequently asked questions about cervical and breast cancer</article-title><source>BMC Womens Health</source><year>2024</year><month>09</month><day>2</day><volume>24</volume><issue>1</issue><fpage>482</fpage><pub-id pub-id-type="doi">10.1186/s12905-024-03320-8</pub-id><pub-id pub-id-type="medline">39223612</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>A shocking Chinese AI advancement called DeepSeek is sending US stocks plunging</article-title><source>CNN Business</source><access-date>2025-12-13</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://edition.cnn.com/2025/01/27/tech/deepseek-stocks-ai-china/index.html">https://edition.cnn.com/2025/01/27/tech/deepseek-stocks-ai-china/index.html</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>The DeepSeek dilemma: navigating innovation and security concerns in Korea&#x2019;s health care industry</article-title><source>Korea Biomedical Review</source><access-date>2025-02-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.koreabiomed.com/news/articleView.html?idxno=26581">https://www.koreabiomed.com/news/articleView.html?idxno=26581</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flesch</surname><given-names>R</given-names> </name></person-group><article-title>A new readability yardstick</article-title><source>J Appl Psychol</source><year>1948</year><month>06</month><volume>32</volume><issue>3</issue><fpage>221</fpage><lpage>233</lpage><pub-id pub-id-type="doi">10.1037/h0057532</pub-id><pub-id pub-id-type="medline">18867058</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Badarudeen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sabharwal</surname><given-names>S</given-names> </name></person-group><article-title>Readability of patient education materials from the American Academy of Orthopaedic Surgeons and Pediatric Orthopaedic Society of North America web sites</article-title><source>J Bone Joint Surg Am</source><year>2008</year><month>01</month><volume>90</volume><issue>1</issue><fpage>199</fpage><lpage>204</lpage><pub-id pub-id-type="doi">10.2106/JBJS.G.00347</pub-id><pub-id pub-id-type="medline">18171975</pub-id></nlm-citation></ref><ref 
id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>Textstat</article-title><source>Python Package Index</source><access-date>2025-12-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/textstat/">https://pypi.org/project/textstat/</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Virtanen</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gommers</surname><given-names>R</given-names> </name><name name-style="western"><surname>Oliphant</surname><given-names>TE</given-names> </name><etal/></person-group><article-title>SciPy 1.0: fundamental algorithms for scientific computing in Python</article-title><source>Nat Methods</source><year>2020</year><month>03</month><volume>17</volume><issue>3</issue><fpage>261</fpage><lpage>272</lpage><pub-id pub-id-type="doi">10.1038/s41592-019-0686-2</pub-id><pub-id pub-id-type="medline">32015543</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>SB</given-names> </name><name name-style="western"><surname>King</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Warner</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Aneja</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kann</surname><given-names>BH</given-names> </name><name name-style="western"><surname>Bylund</surname><given-names>CL</given-names> </name></person-group><article-title>Using ChatGPT to evaluate cancer myths and misconceptions: artificial intelligence and cancer information</article-title><source>JNCI Cancer 
Spectr</source><year>2023</year><month>03</month><day>1</day><volume>7</volume><issue>2</issue><fpage>pkad015</fpage><pub-id pub-id-type="doi">10.1093/jncics/pkad015</pub-id><pub-id pub-id-type="medline">36929393</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haver</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Ambinder</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Bahl</surname><given-names>M</given-names> </name><name name-style="western"><surname>Oluyemi</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Jeudy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>PH</given-names> </name></person-group><article-title>Appropriateness of breast cancer prevention and screening recommendations provided by ChatGPT</article-title><source>Radiology</source><year>2023</year><month>05</month><volume>307</volume><issue>4</issue><fpage>e230424</fpage><pub-id pub-id-type="doi">10.1148/radiol.230424</pub-id><pub-id pub-id-type="medline">37014239</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shapiro</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Wilk</surname><given-names>MB</given-names> </name></person-group><article-title>An analysis of variance test for normality (complete samples)</article-title><source>Biometrika</source><year>1965</year><month>12</month><volume>52</volume><issue>3/4</issue><fpage>591</fpage><pub-id pub-id-type="doi">10.2307/2333709</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Arnastauskait&#x0117;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ruzgas</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bra&#x017E;&#x0117;nas</surname><given-names>M</given-names> </name></person-group><article-title>An exhaustive power comparison of normality tests</article-title><source>Mathematics</source><year>2021</year><volume>9</volume><issue>7</issue><fpage>788</fpage><pub-id pub-id-type="doi">10.3390/math9070788</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruxton</surname><given-names>GD</given-names> </name></person-group><article-title>The unequal variance t-test is an underused alternative to Student&#x2019;s t-test and the Mann&#x2013;Whitney U test</article-title><source>Behav Ecol</source><year>2006</year><month>07</month><day>1</day><volume>17</volume><issue>4</issue><fpage>688</fpage><lpage>690</lpage><pub-id pub-id-type="doi">10.1093/beheco/ark016</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fleiss</surname><given-names>JL</given-names> </name></person-group><article-title>Measuring nominal scale agreement among many raters</article-title><source>Psychol Bull</source><year>1971</year><volume>76</volume><issue>5</issue><fpage>378</fpage><lpage>382</lpage><pub-id pub-id-type="doi">10.1037/h0031619</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol 
Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harris</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Millman</surname><given-names>KJ</given-names> </name><name name-style="western"><surname>van der Walt</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>Array programming with NumPy</article-title><source>Nature</source><year>2020</year><month>09</month><volume>585</volume><issue>7825</issue><fpage>357</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id><pub-id pub-id-type="medline">32939066</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="web"><article-title>Scikit-learn: machine learning in Python &#x2014; scikit-learn 1.7.0 documentation</article-title><source>Scikit-learn</source><access-date>2025-12-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://scikit-learn.org/stable/">https://scikit-learn.org/stable/</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="web"><article-title>Python data analysis library</article-title><source>pandas</source><access-date>2025-12-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pandas.pydata.org/">https://pandas.pydata.org/</ext-link></comment></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vellido</surname><given-names>A</given-names> </name></person-group><article-title>Societal issues concerning the application of artificial intelligence in 
medicine</article-title><source>Kidney Dis (Basel)</source><year>2019</year><month>02</month><volume>5</volume><issue>1</issue><fpage>11</fpage><lpage>17</lpage><pub-id pub-id-type="doi">10.1159/000492428</pub-id><pub-id pub-id-type="medline">30815459</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kitsios</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kamariotou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Syngelakis</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Talias</surname><given-names>MA</given-names> </name></person-group><article-title>Recent advances of artificial intelligence in health care: a systematic literature review</article-title><source>Appl Sci (Basel)</source><year>2023</year><volume>13</volume><issue>13</issue><fpage>7479</fpage><pub-id pub-id-type="doi">10.3390/app13137479</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Hajjo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sabbah</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Bardaweel</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Zhong</surname><given-names>HA</given-names> </name></person-group><article-title>Targeting the EGFR/RAS/RAF signaling pathway in anticancer research: a recent update on inhibitor design and clinical trials (2020-2023)</article-title><source>Expert Opin Ther Pat</source><year>2024</year><volume>34</volume><issue>1-2</issue><fpage>51</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1080/13543776.2024.2327307</pub-id><pub-id pub-id-type="medline">38450537</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sabbah</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Hajjo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bardaweel</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Zhong</surname><given-names>HA</given-names> </name></person-group><article-title>Targeting the PI3K/AKT signaling pathway in anticancer research: a recent update on inhibitor design and clinical trials (2020-2023)</article-title><source>Expert Opin Ther Pat</source><year>2024</year><month>03</month><volume>34</volume><issue>3</issue><fpage>141</fpage><lpage>158</lpage><pub-id pub-id-type="doi">10.1080/13543776.2024.2338100</pub-id><pub-id pub-id-type="medline">38557273</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pasas-Farmer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>R</given-names> 
</name></person-group><article-title>From discovery to delivery: governance of AI in the pharmaceutical industry</article-title><source>Green Analytical Chemistry</source><year>2025</year><month>06</month><volume>13</volume><fpage>100268</fpage><pub-id pub-id-type="doi">10.1016/j.greeac.2025.100268</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Hajjo</surname><given-names>R</given-names> </name></person-group><article-title>AI-Models [GitHub repository]</article-title><source>GitHub</source><access-date>2026-01-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/rhajjo/AI-Models">https://github.com/rhajjo/AI-Models</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Frequently asked questions about breast cancer and answers generated by artificial intelligence models (ChatGPT and DeepSeek) by Researcher 1.</p><media xlink:href="cancer_v12i1e72839_app1.docx" xlink:title="DOCX File, 139 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Frequently asked questions about breast cancer and answers generated by artificial intelligence models (ChatGPT and DeepSeek) by Researcher 2.</p><media xlink:href="cancer_v12i1e72839_app2.docx" xlink:title="DOCX File, 81 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Frequently asked questions about breast cancer and answers generated by artificial intelligence models (ChatGPT and DeepSeek) by Researcher 3.</p><media xlink:href="cancer_v12i1e72839_app3.docx" xlink:title="DOCX File, 119 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Frequently asked questions about breast cancer and expert consensus answers obtained from Ye et 
al.</p><media xlink:href="cancer_v12i1e72839_app4.xlsx" xlink:title="XLSX File, 15 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Human experts and artificial intelligence reliability scores for references cited by ChatGPT-4.0.</p><media xlink:href="cancer_v12i1e72839_app5.xlsx" xlink:title="XLSX File, 11 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Human experts and artificial intelligence reliability scores for references cited by DeepSeek-V3.</p><media xlink:href="cancer_v12i1e72839_app6.xlsx" xlink:title="XLSX File, 22 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Information sources for ChatGPT-4.0 and DeepSeek-V3.</p><media xlink:href="cancer_v12i1e72839_app7.docx" xlink:title="DOCX File, 109 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Average Flesch-Kincaid Grade Level scores for the single-response analysis and results of the paired <italic>t</italic> test.</p><media xlink:href="cancer_v12i1e72839_app8.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Detailed Flesch-Kincaid Grade Level scores for the single-response analysis.</p><media xlink:href="cancer_v12i1e72839_app9.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Average Flesch-Kincaid Grade Level scores for the multiresponse analysis and results of the paired <italic>t</italic> test.</p><media xlink:href="cancer_v12i1e72839_app10.xlsx" xlink:title="XLSX File, 11 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Detailed Flesch-Kincaid Grade Level scores for the multiresponse analysis.</p><media xlink:href="cancer_v12i1e72839_app11.xlsx" 
xlink:title="XLSX File, 11 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Likert scores for the single-instance analysis.</p><media xlink:href="cancer_v12i1e72839_app12.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app13"><label>Multimedia Appendix 13</label><p>Likert scores for the multiresponse analysis.</p><media xlink:href="cancer_v12i1e72839_app13.xlsx" xlink:title="XLSX File, 15 KB"/></supplementary-material><supplementary-material id="app14"><label>Multimedia Appendix 14</label><p>Interrater evaluation of ChatGPT-4.0 references for breast cancer information based on ratings by 3 human experts and alignment with artificial intelligence models.</p><media xlink:href="cancer_v12i1e72839_app14.xlsx" xlink:title="XLSX File, 15 KB"/></supplementary-material><supplementary-material id="app15"><label>Multimedia Appendix 15</label><p>Interrater evaluation of DeepSeek-V3 references for breast cancer information based on ratings by 3 human experts and alignment with artificial intelligence models.</p><media xlink:href="cancer_v12i1e72839_app15.xlsx" xlink:title="XLSX File, 30 KB"/></supplementary-material></app-group></back></article>