<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JC</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Cancer</journal-id>
      <journal-title>JMIR Cancer</journal-title>
      <issn pub-type="epub">2369-1999</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v12i1e86630</article-id>
      <article-id pub-id-type="pmid">41729569</article-id>
      <article-id pub-id-type="doi">10.2196/86630</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluation of GPT-5 for Esophageal Cancer Staging Using Fluorodeoxyglucose Positron Emission Tomography Maximum-Intensity Projection Images: Comparative Pilot Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Balcarras</surname>
            <given-names>Matthew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Al-Adhami</surname>
            <given-names>Dhuha</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mosca</surname>
            <given-names>Lucia</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Maruyama</surname>
            <given-names>Hiroki</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-7845-2281</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Toyama</surname>
            <given-names>Yoshitaka</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Imaging and Anatomy for Groundbreaking Education Collaborative Research</institution>
            <institution>Graduate School of Medicine</institution>
            <institution>Tohoku University</institution>
            <addr-line>2-1 Seiryo-Machi, Aoba-Ku</addr-line>
            <addr-line>Sendai</addr-line>
            <country>Japan</country>
            <phone>81 022 717 7312</phone>
            <fax>81 022 717 7316</fax>
            <email>ytoyama0818@gmail.com</email>
          </address>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0027-9681</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Araki</surname>
            <given-names>Yuya</given-names>
          </name>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-7035-6680</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Takanami</surname>
            <given-names>Kentaro</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0098-7760</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Ito</surname>
            <given-names>Masato</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-7979-999X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Nakajima</surname>
            <given-names>Yumi</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-2948-2357</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Takase</surname>
            <given-names>Kei</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0931-9942</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Kamei</surname>
            <given-names>Takashi</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1282-0463</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Surgery</institution>
        <institution>Graduate School of Medicine</institution>
        <institution>Tohoku University</institution>
        <addr-line>Sendai</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Imaging and Anatomy for Groundbreaking Education Collaborative Research</institution>
        <institution>Graduate School of Medicine</institution>
        <institution>Tohoku University</institution>
        <addr-line>Sendai</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Diagnostic Radiology</institution>
        <institution>Tohoku University Hospital</institution>
        <addr-line>Sendai</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>School of Medicine</institution>
        <institution>Tohoku University</institution>
        <addr-line>Sendai</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Diagnostic Radiology</institution>
        <institution>Osaki Citizen Hospital</institution>
        <addr-line>Osaki</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Diagnostic Radiology</institution>
        <institution>Tohoku Medical and Pharmaceutical University</institution>
        <addr-line>Sendai</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yoshitaka Toyama <email>ytoyama0818@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>23</day>
        <month>2</month>
        <year>2026</year>
      </pub-date>
      <volume>12</volume>
      <elocation-id>e86630</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>10</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>3</day>
          <month>12</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>30</day>
          <month>1</month>
          <year>2026</year>
        </date>
        <date date-type="accepted">
          <day>30</day>
          <month>1</month>
          <year>2026</year>
        </date>
      </history>
      <copyright-statement>©Hiroki Maruyama, Yoshitaka Toyama, Yuya Araki, Kentaro Takanami, Masato Ito, Yumi Nakajima, Kei Takase, Takashi Kamei. Originally published in JMIR Cancer (https://cancer.jmir.org), 23.02.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cancer, is properly cited. The complete bibliographic information, a link to the original publication on https://cancer.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://cancer.jmir.org/2026/1/e86630" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Accurate esophageal cancer staging relies on <sup>18</sup>F fluorodeoxyglucose positron emission tomography (<sup>18</sup>F FDG-PET), but its interpretation is complex and time-intensive. This diagnostic burden is exacerbated by significant workforce shortages in both radiology and surgery, thus necessitating automated support systems. The emergence of advanced large language models (LLMs) has raised expectations for their potential to fulfill this role in complex medical tasks.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We evaluated the diagnostic accuracy of LLMs for staging esophageal cancer using <sup>18</sup>F FDG-PET images, with a focus on their ability to assess lymph nodes (LNs; clinical N [cN]) and distant metastases (clinical M [cM]) for automated radiology reporting.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This retrospective study included 120 consecutive adult patients who were diagnosed with esophageal squamous cell carcinoma and underwent <sup>18</sup>F FDG-PET/computed tomography at Tohoku University Hospital between January 2019 and December 2021. Patients with prior treatment, nonsquamous cell carcinoma histology, or blood glucose levels ≥200 mg/dL were excluded. Frontal maximum-intensity projection positron emission tomography images were extracted, standardized, and analyzed along with information regarding the tumor location. Six LLMs (GPT-5, GPT-4.5, GPT-4.1, OpenAI-o3, -o1, and GPT-4 Turbo) and 4 blinded human evaluators (a nuclear medicine specialist, a gastrointestinal surgeon, and 2 radiology residents) assessed the presence of thoracic and abdominal LN metastases on a region-level basis and determined cN and cM staging on a patient-level basis. The model analyses were performed using the application programming interface in a zero-shot setting. Radiology reports served as the reference standard. Diagnostic agreement and accuracy were evaluated using Cohen κ and the Cochran Q test. Additionally, to account for the class imbalance in the dataset, the Matthews Correlation Coefficient was calculated as a robust metric for binary classification performance. Post hoc McNemar tests were performed with Bonferroni correction; statistical significance for pairwise comparisons was set at <italic>P</italic>&#60;.0083 (adjusted from <italic>P</italic>&#60;.05) using JMP Pro (version 18.0; SAS Institute Inc).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The average accuracy was 41/120 (34%) to 94/120 (78%) for LLMs and 72/120 (60%) to 102/120 (85%) for physicians, with significantly higher accuracy for physicians (<italic>P</italic>&#60;.05) in the thoracic LN, abdominal LN, and cN stages. Interrater reliability was slight to fair for LLMs (κ: –0.07 to 0.25) and fair to substantial for physicians (κ: 0.27 to 0.74). Matthews Correlation Coefficient scores were consistently higher for physicians (0.28 to 0.75) than for LLMs (–0.07 to 0.32). Among the LLMs, GPT-5 demonstrated the highest overall accuracy, with newer LLMs showing improved diagnostic accuracy when compared with previous models in identifying abdominal LN metastases and cM staging, though they showed weaker consistency for cN staging. For example, in thoracic LN detection, GPT-5 achieved 76/120 (63%) accuracy, whereas other LLMs achieved 72/120 (60%) or lower accuracy.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Although current LLMs have not yet reached physician-level accuracy in comprehensive staging, recent models show promise in assisting with specific diagnostic tasks.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>generative artificial intelligence</kwd>
        <kwd>large language models</kwd>
        <kwd>LLMs</kwd>
        <kwd>18F FDG-PET imaging</kwd>
        <kwd>fluorodeoxyglucose positron emission tomography</kwd>
        <kwd>esophageal cancer staging</kwd>
        <kwd>radiology report automation</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Esophageal cancer remains one of the most challenging malignancies to manage. According to the most recent GLOBOCAN statistics and Japanese cancer registry data, it poses a significant global health burden [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Management requires multidisciplinary expertise that spans complex surgical procedures, perioperative care, and advanced imaging interpretation. Esophagectomy is among the most invasive oncologic surgeries, and optimal patient outcomes depend on accurate staging, meticulous operative planning, and coordinated care.</p>
      <p>Surgical services face a mounting workforce shortage: the Association of American Medical Colleges 2024 national projection estimates a shortfall of 10,100 to 19,900 surgeons by 2036 [<xref ref-type="bibr" rid="ref3">3</xref>], and a nationwide Japanese survey reported that over half of teaching hospitals already experience surgeon shortages—even in densely populated prefectures [<xref ref-type="bibr" rid="ref4">4</xref>]. In parallel, radiology faces both workforce shortages and escalating workload: the 2023 Workforce Census for the United Kingdom reports a 30% shortfall of clinical radiologists [<xref ref-type="bibr" rid="ref5">5</xref>], while the volume of image data per study has surged markedly, compounding reporting demands [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <p>Against this backdrop, fluorodeoxyglucose positron emission tomography/computed tomography (<sup>18</sup>F FDG-PET/CT)—a cornerstone of preoperative staging in esophageal cancer—is notably time-consuming and complex to interpret [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], requiring integration of functional and anatomical information. This adds to the workload of both surgeons, who must incorporate imaging findings into surgical planning, and radiologists, who must provide comprehensive and timely reports for multidisciplinary decision-making.</p>
      <p>International guidelines, including the American College of Radiology Appropriateness Criteria and the European Society for Medical Oncology recommendations, endorse <sup>18</sup>F FDG-PET/CT for baseline staging and selected follow-up in esophageal cancer [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. <sup>18</sup>F FDG-PET/CT is recognized not only for its diagnostic utility in detecting distant metastases and assessing nodal involvement but also for its significant prognostic value in oncology, as metabolic parameters often correlate with patient outcomes [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      <p>Generative artificial intelligence (AI), a subset of AI capable of creating new content, has revolutionized various fields. Within this domain, large language models (LLMs) are deep learning algorithms trained on massive datasets to understand and generate human-like text. Recently, the evolution of these models into multimodal large language models, which can process and interpret both text and images simultaneously, has expanded their potential applications in health care. While previous research has demonstrated the utility of AI in medical tasks such as summarizing radiology reports or passing medical licensing examinations [<xref ref-type="bibr" rid="ref14">14</xref>], the application of general-purpose multimodal large language models to complex image interpretation remains limited. Most prior studies have focused on anatomical imaging modalities like plain radiography or CT for simple classification tasks [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. There is a paucity of research evaluating whether these models can perform high-level clinical reasoning, specifically TNM staging based on functional nuclear medicine imaging (<sup>18</sup>F FDG-PET). Against this backdrop, and with the release of GPT-5, the latest publicly available LLM from OpenAI, we report the first evaluation of the medical image interpretation capabilities of this model. We compared its diagnostic accuracy in esophageal cancer staging with that of physicians and other state-of-the-art models.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>We adhered to the guidelines outlined in the checklist for AI in medical imaging to ensure methodological transparency and ethical rigor [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This retrospective study conformed to the ethical standards of the Declaration of Helsinki (1975, as revised in 2013) and was approved by the Institutional Review Board of Tohoku University Hospital (approval number 2024-1-816). The Institutional Review Board explicitly approved the transfer of deidentified patient image data to the third-party commercial servers used by the application programming interfaces (APIs). The requirement for individual informed consent was waived, and patients were informed regarding the study via an opt-out method on the hospital website. To protect patient privacy and confidentiality, all data used for analysis were anonymized; the correspondence table linking study IDs to personal information was stored separately in a secure location restricted to authorized personnel. Prior to uploading, all images were fully deidentified by removing all DICOM metadata and converting them to JPEG format. As this study involved the secondary use of existing data, no compensation was provided to the participants. Furthermore, all images included in the manuscript and supplementary materials were carefully cropped to remove any personally identifiable information, such as patient ID, name, age, and sex, to ensure that individual participants cannot be identified.</p>
      </sec>
      <sec>
        <title>Patients</title>
        <p>The cases analyzed in this study were derived from a prospective, continuously registered cohort of patients who were hospitalized and treated at our institution between January 2019 and December 2021. All participants underwent upper gastrointestinal endoscopy and were diagnosed with esophageal cancer that was confirmed via biopsy. The patients were eligible for inclusion if they were 18 years of age or older, had undergone their first positron emission tomography (PET)/CT examination using a GE scanner at our hospital, and had a biopsy-confirmed diagnosis of squamous cell carcinoma (SCC). Patients were excluded if they had a history of treatment for esophageal cancer, a histological type other than SCC, such as adenocarcinoma, or a pre-examination blood glucose level of ≥200 mg/dL. The overall study design is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Research workflow for comparing large language models with physicians in the staging of esophageal cancer. This flowchart illustrates the study design for evaluating the performance of GPT-5, GPT-4.5, GPT-4.1, OpenAI-o3, OpenAI-o1, and GPT-4 Turbo in comparison with physicians using MIP images from 18F FDG-PET. The workflow includes key steps, such as acquiring MIP frontal images, cropping regions of interest, analyzing these images and tumor location data using large language models, and assessing their performance relative to that of physicians. FDG-PET: fluorodeoxyglucose positron emission tomography; MIP: maximum intensity projection.</p>
          </caption>
          <graphic xlink:href="cancer_v12i1e86630_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>PET Imaging and Interpretation</title>
        <p><sup>18</sup>F FDG-PET/CT was performed at our institution by using a silicon photomultiplier PET scanner (Discovery MI; GE Healthcare). Patients were instructed to fast for at least 6 h prior to the <sup>18</sup>F FDG injection. A rapid intravenous injection of <sup>18</sup>F FDG, at a dose of approximately 3.7 MBq/kg, was administered through the right antecubital vein. After a 60-minute uptake period, patients underwent a low-dose CT scan (140 kV; automatic exposure control, 20-80 mA), followed by whole-body PET/CT imaging.</p>
        <p>All PET/CT images were interpreted by board-certified radiologists who had specialized in both radiology and nuclear medicine and had been accredited by the Japanese Society of Radiology and the Japanese Society of Nuclear Medicine. Interpretations were made in the context of available clinical information and correlative imaging studies, such as contrast-enhanced CT. Their interpretation reports served as the gold standard for this study, which evaluated whether LLMs can generate radiology reports. The interpretation reports were documented in accordance with the 8th edition of the Union for International Cancer Control staging system, which ensured that the N and M classifications, in addition to the T classification, were defined [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      </sec>
      <sec>
        <title>Image Selection and Data Preparation</title>
        <p>As a single maximum intensity projection (MIP) image extracted from a PET scan provides information about the whole body, it has been suggested that AI diagnosis could reduce the data load [<xref ref-type="bibr" rid="ref21">21</xref>]. Therefore, we chose to analyze the MIP, as it offers a comprehensive view of metabolic activity across the entire body while simplifying data processing and interpretation. MIP frontal images were extracted from the acquired DICOM-format PET/CT images and subsequently converted to the JPEG format. During this process, the regions from the neck up and pelvis down were cropped to exclude common physiological accumulations in the oral cavity and bladder. The decision to crop regions distal to the pelvis was further supported by the finding that no bone metastases were identified in these areas within our dataset. The original images (563 × 710 pixels) were cropped to a fixed size of 270 × 250 pixels for standardization.</p>
        <p>Data management and entry were performed using Microsoft Excel (Microsoft Corp). Based on the radiological interpretation reports (reference standard), the following patient variables were collected: age, sex, primary tumor location, presence of thoracic lymph node (LN) metastasis, presence of abdominal LN metastasis, clinical N stage (cN), clinical M stage (cM), clinical stage, and treatment modality.</p>
        <p>The analysis was conducted at the patient level for determining the cN and cM stages and at the region level for assessing the presence of metastasis in the thoracic and abdominal fields. This unit of analysis was selected to align with clinical decision-making processes, where the overall stage and regional involvement dictate the treatment strategy, rather than the precise counting of individual LNs. All staging was performed in accordance with the 8th edition of the Union for International Cancer Control TNM classification.</p>
        <p>The location of the primary tumor, which was used as an input for GPT, was determined by a specialist in gastrointestinal surgery. This classification was based on a comprehensive review of upper gastrointestinal endoscopy, fluoroscopy, and CT images based on the guidelines outlined in the Japanese Classification of Esophageal Cancer, 12th Edition [<xref ref-type="bibr" rid="ref22">22</xref>]. The cervical and upper thoracic esophagus were classified as the upper region, the middle thoracic esophagus as the middle region, and the lower thoracic esophagus and esophagogastric junction as the lower region, to generate a 3-tier classification.</p>
      </sec>
      <sec>
        <title>LLM-Based Analysis</title>
        <p>The selection of LLMs for this study was restricted to the GPT series developed by OpenAI, as these models are currently the most widely used generative AI platforms globally and provide a robust API that facilitates seamless multimodal data input. In this study, 6 LLMs were used for analysis: GPT-5, GPT-4.5, GPT-4.1, OpenAI-o3, OpenAI-o1, and GPT-4 Turbo (OpenAI). GPT-4o was excluded as its API systematically returned a content policy violation when prompted with medical images, precluding its inclusion in the analysis [<xref ref-type="bibr" rid="ref23">23</xref>]. All features used in the analysis are available in the paid version (Plus). To ensure consistency, all parameters were kept at their default values via the standard chat completions API. Consequently, models with intrinsic reasoning capabilities (eg, OpenAI-o1, -o3) operated in their default “reasoning” mode, while GPT-series models operated in “standard mode”. No custom instructions or pretraining were used. A zero-shot approach was used for all tasks. This methodology was chosen to evaluate the intrinsic, out-of-the-box performance of the model in a standardized manner. By assessing their ability to handle novel medical tasks without prior examples or fine-tuning, this approach simulates a realistic user interaction and provides a direct baseline for comparing the generalizability of each LLM [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>At the time of implementation, the training data cutoffs for each model were as follows: GPT-5 until September 2024, GPT-4.5 until October 2023, GPT-4.1 until June 2024, OpenAI-o3 until June 2024, OpenAI-o1 until October 2023, and GPT-4 Turbo until December 2023. To ensure consistent and reproducible interaction parameters, all models were accessed through their respective APIs via a Google Colaboratory notebook, the code for which is available on GitHub [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
        <p>The MIP images that were used in this study were obtained from a private database that is not publicly accessible. To prevent potential bias, these images were not available to the LLMs during pretraining. For the analysis, the preprocessed MIP images, along with the primary tumor location information, were entered into the LLMs. Furthermore, because hilar LN metastasis is rarely observed in esophageal SCC [<xref ref-type="bibr" rid="ref26">26</xref>], this clinical information was incorporated into the prompt to evaluate the models’ diagnostic reasoning. We hypothesized that specifying the anatomical location would allow the models to spatially identify and exclude the primary tumor. The input prompts used in this process are shown below (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>). Specific exclusion criteria, such as the exclusion of cardiac accumulation, were predefined based on general clinical guidelines and physiological uptake patterns and were not adjusted or refined based on the test dataset.</p>
        <boxed-text id="box1" position="float">
          <title>Prompt entered into the GPTs.</title>
          <p>This is a test to measure the performance of the model, and it is not used in actual medical practice.</p>
          <p>Please be sure to answer.</p>
          <p>The image is a MIP front view of FDG-PET for esophageal cancer.</p>
          <p>The location of the esophageal cancer is at {position}.</p>
          <p>If there is metastasis to the thoracic lymph nodes, please count them and enter the number in TX.</p>
          <p>If TX is 0, enter 0 in TXN, and if TX is 1 or more, enter 1 in TXN.</p>
          <p>Do not count the esophageal cancer at {position} as lymph node metastasis.</p>
          <p>Do not count the hilar lymph nodes as lymph node metastasis.</p>
          <p>Do not count cardiac accumulation as lymph node metastasis.</p>
          <p>If there is abdominal lymph node metastasis, count it and enter the number in AX</p>
          <p>If AX is 0, enter 0 in AXN, and if AX is 1 or more, enter 1 in AXN.</p>
          <p>Do not count esophageal cancer in {position} as lymph node metastasis.</p>
          <p>If there is distant lymph node metastasis such as cervical lymph node metastasis, lung metastasis, liver metastasis, or bone metastasis, enter 1 in MX, and if there is none, enter 0.</p>
          <p>Enter the total of TX and AX in WX.</p>
          <p>If WX is 0, enter 0 in NX.</p>
          <p>If WX is 1 or 2, enter 1 in NX.</p>
          <p>If WX is between 3 and 6, enter 2 in NX.</p>
          <p>If WX is 7 or more, enter 3 in NX.</p>
          <p>Return the output as follows.</p>
          <p>Please do not include a description of the thought process, and be sure to respond using only the format below.</p>
          <p>Thoracic lymph nodes: TXN</p>
          <p>Abdominal lymph nodes: AXN</p>
          <p>N Stage：NX</p>
          <p>M Stage：MX</p>
        </boxed-text>
        <p>The LLMs analyzed the MIP images and provided staging-related assessments of esophageal cancer. Research using GPT-5 was analyzed on August 11, 2025. Research analysis using GPT-4.5, OpenAI-o1, and GPT-4 Turbo was conducted on March 23, 2025. Research using GPT-4.1 and OpenAI-o3 was analyzed on May 2, 2025.</p>
        <p>To further assess the textual consistency of the model’s outputs and address the “black box” limitation, we performed a post hoc qualitative subanalysis on the 3 representative cases (shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>) on December 9, 2025. For this analysis, the prompt was modified to include the following instruction: “Please state the basis for reaching that diagnosis.” This enabled us to examine whether the model’s generated explanation aligned with clinical features, although it does not guarantee that the model visually attended to them.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Examples of input images and responses of GPT-4.5, GPT-4.1, and OpenAI-o3 in cases of esophageal cancer. The primary tumor site indicated in the radiology report is shown as a blue circle, and the metastatic LNs are shown as red circles. Note that these colored circles were manually overlaid by the authors to visualize the ground truth and were not generated by the AI models. The yellow cells indicate the correct answers (agreement with the ground truth). (A) All the models correctly identified the absence of LN and distant metastases beyond the primary lesion. (B) A case with a single metastatic thoracic LN. Only GPT-5 and OpenAI-o1 provided a correct evaluation, identifying thoracic LN metastasis, no abdominal LN metastasis, and the correct cN and cM stages. Other models either failed to identify the thoracic LN metastasis or misdiagnosed abdominal LN metastasis as positive. (C) A cN-stage 2 case with thoracic LN metastasis. 18F FDG (fluorodeoxyglucose) accumulation in the hilar LNs was interpreted as nonspecific accumulation in the radiology report. GPT-5 correctly identified the cN stage but misdiagnosed abdominal LN metastasis as positive. Although other models correctly identified thoracic LN metastasis, many incorrectly stated the disease as N1. cM: clinical M; cN: clinical N; LN: lymph nodes.</p>
          </caption>
          <graphic xlink:href="cancer_v12i1e86630_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Physician’s Evaluation</title>
        <p>The same information that was provided to the LLMs, including the cropped MIP images and primary tumor location for esophageal cancer, was presented to 4 human evaluators: a nuclear medicine specialist with 14 years of experience, a gastrointestinal surgeon with 9 years of experience, and 2 radiology residents. To prevent bias, the evaluators were blinded to the contents of the diagnostic report. Using the same criteria that were applied by the LLMs, each evaluator independently assessed the images and determined the presence or absence of thoracic LN metastases, abdominal LN metastases, and cN and cM stages. The evaluators were not involved in the diagnosis or treatment of the included patients. To ensure parity with the AI input (which included tumor location prompts), evaluators were provided with the tumor location information but were strictly blinded to all other clinical data, including patient history, reference radiology reports, and pathological outcomes.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>The primary outcome was diagnostic accuracy, defined as the concordance with the reference standard. The secondary outcome was interrater reliability assessed using Cohen κ. The CIs for each rater’s diagnostic performance were calculated using the Wilson score interval (without continuity correction). Cohen κ consistency analysis was used to assess the agreement between the LLMs, physicians, and actual diagnostic reports. Additionally, for binary classification tasks (assessment of thoracic LN metastasis, abdominal LN metastasis, and cM stage), the Matthews correlation coefficient (MCC) was calculated. MCC is considered a robust metric for imbalanced datasets, as it incorporates true and false positives and negatives, returning a high score only when the prediction performs well across all confusion matrix categories [<xref ref-type="bibr" rid="ref27">27</xref>]. The κ values were interpreted according to the following scale: 0-0.2 (poor agreement), 0.2-0.4 (fair agreement), 0.4-0.6 (moderate agreement), 0.6-0.8 (substantial agreement), and 0.8-1.0 (almost perfect agreement) [<xref ref-type="bibr" rid="ref28">28</xref>]. Student <italic>t</italic> test and Cochran Q test were used to compare the rates of diagnostic accuracy between LLMs and physicians, followed by the post hoc McNemar test [<xref ref-type="bibr" rid="ref29">29</xref>]. Data were analyzed using JMP Pro (version 18.0; SAS Institute Inc). For the post hoc McNemar test only, the Holm-Bonferroni correction was applied, yielding an adjusted significance threshold of <italic>P</italic>&#60;.0083; for all other analyses, a <italic>P</italic> value &#60;.05 was considered statistically significant.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Baseline Characteristics of the Study Population</title>
        <p>Of the 311 patients with esophageal cancer who were admitted to our department, 36 were excluded because of a histological type other than SCC, 139 had already received treatment or were not undergoing their first PET/CT scan, and 16 had other carcinomas. Thus, 120 patients were included in this study (<xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Case inclusion and exclusion flowchart. PET: positron emission tomography.</p>
          </caption>
          <graphic xlink:href="cancer_v12i1e86630_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Within this cohort, 120 primary esophageal cancer lesions were identified and analyzed. Histopathologically, all cases (100%) were confirmed as SCC. The study population comprised 120 patients (median age 71 years; 25/120 women, 20.8%); 58/120 (48.3%) patients had thoracic LN metastasis, and 35/120 (29.2%) had abdominal LN metastasis. The cN stage was N0 in 45/120 (37.5%) patients, N1 in 52/120 (43.3%) patients, N2 in 22/120 (18.3%) patients, and N3 in 1/120 (0.8%) patients. The cM stage was M1 in 27/120 (22.5%) patients. The detailed data are presented in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Patient characteristics. All patients included had squamous cell carcinoma.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="600"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Characteristics</td>
                <td>Values (n=120)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">Age (years), median (range)</td>
                <td>71 (44-89)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Sex, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td>95 (79.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td>25 (20.8)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Location, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>CeUt<sup>ab</sup></td>
                <td>16 (13.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mt<sup>c</sup></td>
                <td>61 (50.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LtJz<sup>de</sup></td>
                <td>43 (35.8)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Location of LN<sup>f</sup> metastasis, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Thoracic LN</td>
                <td>58 (48.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Abdominal LN</td>
                <td>35 (29.2)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>cN-stage<sup>g</sup>, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>N0</td>
                <td>45 (37.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>N1</td>
                <td>52 (43.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>N2</td>
                <td>22 (18.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>N3</td>
                <td>1 (0.8)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>cM-stage<sup>h</sup>, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>M0</td>
                <td>93 (77.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>M1</td>
                <td>27 (22.5)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>cStage<sup>i</sup>, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>I</td>
                <td>17 (14.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>II/III</td>
                <td>63 (52.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>IV</td>
                <td>40 (33.3)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Treatment, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Operation</td>
                <td>80 (66.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Chemotherapy/radiation therapy</td>
                <td>36 (30)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BSC<sup>j</sup></td>
                <td>4 (3.3)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Ce: cervical esophagus.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Ut: upper thoracic esophagus.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Mt: middle thoracic esophagus.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Lt: lower thoracic esophagus.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>Jz: zone of the esophagogastric junction.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>LN: lymph node.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>cN: clinical N.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>cM: clinical M.</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>cStage: clinical stage.</p>
            </fn>
            <fn id="table1fn10">
              <p><sup>j</sup>BSC: best supportive care.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Examples of MIP Images and LLM Responses</title>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> presents representative MIP images that were entered into GPT-5, GPT-4.5, GPT-4.1, and OpenAI-o3, along with their corresponding diagnostic outputs. All patients had middle thoracic esophageal cancer.</p>
        <p>Case A involved a patient without LN or distant metastases outside the primary lesion. All the LLMs correctly identified the absence of LN and distant metastases. In Case B, which featured a patient with a single metastatic thoracic LN, only GPT-5 and OpenAI-o1 provided a fully correct evaluation. These models accurately identified the thoracic LN metastasis, correctly reported the absence of abdominal LN metastasis, and determined the proper cN and cM stages. The other models either failed to detect the thoracic metastasis or incorrectly identified abdominal LN involvement. Case C presented a patient with cN-stage 2 thoracic LN metastasis. In this instance, GPT-5 correctly identified the cN stage but misdiagnosed abdominal LN metastasis as positive. While the other models correctly detected the presence of thoracic LN metastasis, they failed to determine the correct stage, with many classifying the disease as N1.</p>
      </sec>
      <sec>
        <title>Qualitative Assessment of Generated Rationale</title>
        <p>The results of the reasoning verification subanalysis conducted for the 3 cases shown in <xref rid="figure2" ref-type="fig">Figure 2</xref> are summarized in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>In Case A, GPT-5 provided a correct diagnosis with a rationale that explicitly mentioned the exclusion of cardiac and hilar uptake, consistent with the instructions.</p>
        <p>In Case B, although GPT-5 had correctly identified the thoracic LN metastasis in the primary analysis, the model failed to detect the lesion in the subanalysis (False Negative), stating “No additional discrete FDG-avid mediastinal nodal foci.” The text output suggested that the model actively evaluated and excluded the hilar region; however, this inconsistency highlights the stochastic nature of LLMs, where minor prompt alterations (eg, adding a request for reasoning) can alter the diagnostic outcome.</p>
        <p>In Case C, the model correctly identified the N2 stage, but the reasoning revealed a discrepancy. It correctly excluded hilar uptake in its explanation, but hallucinated an abdominal LN metastasis (False Positive). This suggests that although the model can generate text that appears to apply exclusion criteria, it may still misidentify physiological uptake or noise as pathological lesions, thereby reaching the correct stage for the wrong anatomical reason.</p>
      </sec>
      <sec>
        <title>Overall Diagnostic Performance of GPTs and Physicians</title>
        <p>The correct response rate, sensitivity, specificity, and Cohen κ coefficient for each parameter are presented for both the LLMs and physicians. The overall diagnostic performance is summarized in <xref ref-type="table" rid="table2">Tables 2</xref>-<xref ref-type="table" rid="table5">5</xref>. In the overall correct response rate, LLMs achieved a rate of 41/120 (34%; 95% CI 26%-43%) to 94/120 (78%; 95% CI 70%-86%), whereas physicians demonstrated a higher rate of 70/120 (58%; 95% CI 49%-67%) to 108/120 (90%; 95% CI 83%-94%). The correct response rate of LLMs for the thoracic and abdominal LN ranged from 60/120 (50%; 95% CI 40%-59%) to 87/120 (73%; 95% CI 64%-80%). The sensitivity of LLMs ranged from 7/120 (6%; 95% CI 0%-14%) to 112/120 (93%; 95% CI 86%-100%), whereas that of physicians ranged from 65/120 (54%; 95% CI 37%-72%) to 104/120 (87%; 95% CI 77%-94%). The specificity was 12/120 (10%; 95% CI 2%-17%) to 115/120 (96%; 95% CI 90%-99%) for LLMs and 87/120 (73%; 95% CI 61%-84%) to 119/120 (99%; 95% CI 94%-100%) for physicians. For the cN stage, the correct response rate was 41/120 (34%; 95% CI 26%-43%) to 58/120 (48%; 95% CI 40%-57%) for LLMs and 70/120 (58%; 95% CI 49%-67%) to 73/120 (61%; 95% CI 52%-70%) for physicians. For the cM stage, the correct response rate ranged from 91/120 (76%; 95% CI 68%-84%) to 102/120 (85%; 95% CI 79%-92%) for both LLMs and physicians. The sensitivity was 0/120 (0%; 95% CI 0%-0%) to 18/120 (15%; 95% CI 0.5%-29%) for LLMs and 40/120 (33%; 95% CI 14%-52%) to 67/120 (56%; 95% CI 37%-72%) for physicians. The specificity for both groups was 100/120 (83%; 95% CI 75%-91%) to 119/120 (99%; 95% CI 97%-100%). In terms of MCC, which adjusts for class imbalance, physicians consistently outperformed LLMs. For example, in the assessment of thoracic LN metastasis, the radiologist achieved an MCC of 0.573, whereas the highest-performing LLM (GPT-5) reached only 0.317.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Overall diagnostic performance of GPTs and physicians for thoracic lymph nodes.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="150"/>
            <col width="160"/>
            <col width="160"/>
            <col width="130"/>
            <col width="70"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy (%) (95% CI)</td>
                <td>Sensitivity (%) (95% CI)</td>
                <td>Specificity (%) (95% CI)</td>
                <td>Cohen κ value</td>
                <td><italic>P</italic> value</td>
                <td>Matthews correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-5</td>
                <td>63 (54-71)</td>
                <td>31 (21-44)</td>
                <td>94 (84-97)</td>
                <td>0.25</td>
                <td>&#60;.01</td>
                <td>0.32</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.5</td>
                <td>60 (51-68)</td>
                <td>35 (22-47)</td>
                <td>84 (75-93)</td>
                <td>0.19</td>
                <td>&#60;.01</td>
                <td>0.21</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.1</td>
                <td>58 (49-66)</td>
                <td>28 (18-40)</td>
                <td>85 (75-92)</td>
                <td>0.13</td>
                <td>&#60;.01</td>
                <td>0.16</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o3</td>
                <td>56 (47-64)</td>
                <td>22 (14-35)</td>
                <td>87 (77-93)</td>
                <td>0.097</td>
                <td>—<sup>a</sup></td>
                <td>0.13</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o1</td>
                <td>50 (40-59)</td>
                <td>93 (86-100)</td>
                <td>10 (2-17)</td>
                <td>0.027</td>
                <td>—</td>
                <td>0.05</td>
              </tr>
              <tr valign="top">
                <td>GPT-turbo</td>
                <td>52 (43-61)</td>
                <td>20 (9-31)</td>
                <td>82 (73-92)</td>
                <td>0.03</td>
                <td>—</td>
                <td>0.04</td>
              </tr>
              <tr valign="top">
                <td>Radiologist</td>
                <td>78 (71-86)</td>
                <td>84 (75-94)</td>
                <td>73 (61-84)</td>
                <td>0.57</td>
                <td>&#60;.001</td>
                <td>0.57</td>
              </tr>
              <tr valign="top">
                <td>Surgeon</td>
                <td>74 (66-82)</td>
                <td>60 (47-73)</td>
                <td>87 (79-96)</td>
                <td>0.48</td>
                <td>&#60;.001</td>
                <td>0.49</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 1</td>
                <td>74 (66-82)</td>
                <td>71 (59-83)</td>
                <td>77 (67-88)</td>
                <td>0.48</td>
                <td>&#60;.001</td>
                <td>0.48</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 2</td>
                <td>80 (72-86)</td>
                <td>87 (77-94)</td>
                <td>73 (62-83)</td>
                <td>0.60</td>
                <td>&#60;.001</td>
                <td>0.59</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Overall diagnostic performance of GPTs and physicians for abdominal lymph nodes.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="150"/>
            <col width="160"/>
            <col width="160"/>
            <col width="130"/>
            <col width="70"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy (%) (95% CI)</td>
                <td>Sensitivity (%) (95% CI)</td>
                <td>Specificity (%) (95% CI)</td>
                <td>Cohen κ value</td>
                <td><italic>P</italic> value</td>
                <td>Matthews correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-5</td>
                <td>73 (64-80)</td>
                <td>14 (6-29)</td>
                <td>96 (90-99)</td>
                <td>0.14</td>
                <td>&#60;.01</td>
                <td>0.20</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.5</td>
                <td>69 (61-77)</td>
                <td>34 (18-51)</td>
                <td>82 (74-91)</td>
                <td>0.18</td>
                <td>&#60;.01</td>
                <td>0.18</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.1</td>
                <td>71 (62-78)</td>
                <td>17 (8-33)</td>
                <td>93 (85-97)</td>
                <td>0.13</td>
                <td>&#60;.01</td>
                <td>0.15</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o3</td>
                <td>71 (62-78)</td>
                <td>9 (3-22)</td>
                <td>96 (90-99)</td>
                <td>0.067</td>
                <td>—<sup>a</sup></td>
                <td>0.11</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o1</td>
                <td>56 (47-65)</td>
                <td>63 (46-80)</td>
                <td>53 (42-64)</td>
                <td>0.081</td>
                <td>—</td>
                <td>0.14</td>
              </tr>
              <tr valign="top">
                <td>GPT-turbo</td>
                <td>66 (57-74)</td>
                <td>6 (0-14)</td>
                <td>91 (84-97)</td>
                <td>0.063</td>
                <td>—</td>
                <td>–0.06</td>
              </tr>
              <tr valign="top">
                <td>Radiologist</td>
                <td>80 (73-87)</td>
                <td>57 (40-74)</td>
                <td>88 (81-95)</td>
                <td>0.47</td>
                <td>&#60;.001</td>
                <td>0.48</td>
              </tr>
              <tr valign="top">
                <td>Surgeon</td>
                <td>82 (75-89)</td>
                <td>54 (37-72)</td>
                <td>93 (87-99)</td>
                <td>0.52</td>
                <td>&#60;.001</td>
                <td>0.53</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 1</td>
                <td>88 (82-94)</td>
                <td>66 (49-82)</td>
                <td>97 (93-100)</td>
                <td>0.67</td>
                <td>&#60;.001</td>
                <td>0.69</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 2</td>
                <td>90 (83-94)</td>
                <td>69 (52-81)</td>
                <td>99 (94-100)</td>
                <td>0.74</td>
                <td>&#60;.001</td>
                <td>0.75</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Overall diagnostic performance of GPTs and physicians for clinical N-stage (cN-stage).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="280"/>
            <col width="330"/>
            <col width="260"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy (%) (95% CI)</td>
                <td>Cohen κ value</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-5</td>
                <td>48 (40-57)</td>
                <td>0.18</td>
                <td>&#60;.01</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.5</td>
                <td>43 (34-52)</td>
                <td>0.051</td>
                <td>—<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>GPT-4.1</td>
                <td>45 (36-54)</td>
                <td>0.12</td>
                <td>&#60;.01</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o3</td>
                <td>39 (31-48)</td>
                <td>0.043</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o1</td>
                <td>34 (26-43)</td>
                <td>0.055</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>GPT-turbo</td>
                <td>34 (26-43)</td>
                <td>–0.072</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Radiologist</td>
                <td>58 (49-67)</td>
                <td>0.38</td>
                <td>&#60;.01</td>
              </tr>
              <tr valign="top">
                <td>Surgeon</td>
                <td>61 (52-70)</td>
                <td>0.34</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 1</td>
                <td>61 (52-70)</td>
                <td>0.39</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 2</td>
                <td>61 (52-69)</td>
                <td>0.39</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Overall diagnostic performance of GPTs and physicians for clinical M-stage (cM-stage).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="150"/>
            <col width="160"/>
            <col width="160"/>
            <col width="130"/>
            <col width="70"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy (%) (95% CI)</td>
                <td>Sensitivity (%) (95% CI)</td>
                <td>Specificity (%) (95% CI)</td>
                <td>Cohen κ value</td>
                <td><italic>P</italic> value</td>
                <td>Matthews correlation coefficient</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-5</td>
                <td>77 (68-83)</td>
                <td>4 (1-18)</td>
                <td>98 (92-99)</td>
                <td>0.023</td>
                <td>—<sup>a</sup></td>
                <td>0.042</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.5</td>
                <td>76 (68-84)</td>
                <td>0 (0-0)</td>
                <td>98 (95-100)</td>
                <td>–0.032</td>
                <td>—</td>
                <td>–0.07</td>
              </tr>
              <tr valign="top">
                <td>GPT-4.1</td>
                <td>77 (68-83)</td>
                <td>0 (0-0)</td>
                <td>99 (94-100)</td>
                <td>–0.016</td>
                <td>—</td>
                <td>–0.05</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o3</td>
                <td>77 (68-83)</td>
                <td>4 (1-18)</td>
                <td>98 (92-99)</td>
                <td>0.023</td>
                <td>—</td>
                <td>0.04</td>
              </tr>
              <tr valign="top">
                <td>OpenAI-o1</td>
                <td>78 (70-85)</td>
                <td>4 (0-11)</td>
                <td>99 (97-100)</td>
                <td>0.039</td>
                <td>—</td>
                <td>0.09</td>
              </tr>
              <tr valign="top">
                <td>GPT-turbo</td>
                <td>78 (70-86)</td>
                <td>15 (0.5-29)</td>
                <td>96 (92-100)</td>
                <td>0.14</td>
                <td>&#60;.01</td>
                <td>0.18</td>
              </tr>
              <tr valign="top">
                <td>Radiologist</td>
                <td>78 (70-85)</td>
                <td>33 (14-52)</td>
                <td>90 (84-96)</td>
                <td>0.27</td>
                <td>&#60;.001</td>
                <td>0.28</td>
              </tr>
              <tr valign="top">
                <td>Surgeon</td>
                <td>85 (79-92)</td>
                <td>52 (32-72)</td>
                <td>95 (90-99)</td>
                <td>0.52</td>
                <td>&#60;.001</td>
                <td>0.53</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 1</td>
                <td>77 (69-84)</td>
                <td>56 (36-76)</td>
                <td>83 (75-91)</td>
                <td>0.36</td>
                <td>&#60;.001</td>
                <td>0.37</td>
              </tr>
              <tr valign="top">
                <td>Radiology resident 2</td>
                <td>83 (76-89)</td>
                <td>56 (37-72)</td>
                <td>91 (84-96)</td>
                <td>0.50</td>
                <td>&#60;.001</td>
                <td>0.50</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In the analysis of diagnostic correlation, GPT-5 and GPT-4.1 both demonstrated statistically significant weak correlations with the diagnoses of thoracic LN metastasis, abdominal LN metastasis, and cN stage. GPT-4.5 showed a statistically significant weak correlation for the diagnoses of thoracic and abdominal LN metastasis. GPT-4 Turbo showed a statistically significant weak correlation in the diagnosis of the stage. The other models did not demonstrate statistically significant consistency. In contrast, all physicians demonstrated a statistically significant moderate consistency for all items.</p>
      </sec>
      <sec>
        <title>Comparison of Accuracy</title>
        <p>First, we compared the average correct answer rates of LLMs and physicians. In the assessment of thoracic LN metastases, LLMs achieved 68/120 (57%; 95% CI 52%-61%) accuracy, whereas physicians achieved 91/120 (76%; 95% CI 72%-80%) accuracy. In the evaluation of abdominal LNs, LLMs reached an accuracy of 82/120 (68%; 95% CI 62%-73%) compared with 102/120 (85%; 95% CI 78%-91%) for physicians. For cN stage diagnosis, LLMs attained 49/120 (41%; 95% CI 36%-45%) accuracy, whereas physicians achieved 71/120 (60%; 95% CI 55%-64%) accuracy. In the cM-stage assessment, the LLMs achieved 92/120 (77%; 95% CI 75%-80%) accuracy, which was slightly lower than the 96/120 (80%; 95% CI 77%-83%) accuracy observed among the physicians. Overall, physicians demonstrated significantly higher accuracy than LLMs in the evaluation of thoracic LN metastasis, abdominal LN metastasis, and cN stage (<italic>P</italic>&#60;.05; <xref ref-type="table" rid="table6">Table 6</xref>).</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Comparison of average accuracy between large language models (LLMs) and physicians.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Thoracic LN<sup>a</sup> (%) (95% CI)</td>
                <td>Abdominal LN (%) (95% CI)</td>
                <td>cN-stage<sup>b</sup> (%) (95% CI)</td>
                <td>cM-stage<sup>c</sup> (%) (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LLMs (%)</td>
                <td>57 (52-61)</td>
                <td>68 (62-73)</td>
                <td>41 (36-45)</td>
                <td>77 (75-80)</td>
              </tr>
              <tr valign="top">
                <td>Physicians (%)</td>
                <td>76 (72-80)</td>
                <td>85 (78-91)</td>
                <td>60 (55-64)</td>
                <td>80 (77-83)</td>
              </tr>
              <tr valign="top">
                <td><italic>P</italic> value</td>
                <td>&#60;.001</td>
                <td>.002</td>
                <td>&#60;.001</td>
                <td>.052</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>LN: lymph node.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>cN: clinical N.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>cM: clinical M.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Among the LLMs, GPT-5 demonstrated the highest diagnostic accuracy for thoracic LN metastasis, abdominal LN metastasis, and cN stage, and achieved one of the highest accuracies for the cM stage. The results of McNemar’s pairwise test between the LLMs GPT-5, GPT-4.5, GPT-4.1, and OpenAI-o3 and the radiologist are shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>. Comparisons between the LLMs revealed no statistically significant differences across most evaluated parameters; however, a statistically significant difference was observed between GPT-5 and OpenAI-o3 in the diagnosis of thoracic LN metastasis. In the diagnosis of thoracic LN metastasis, all LLMs demonstrated significantly lower accuracy than radiologists. However, for cN stage diagnosis, GPT-5 and GPT-4.1 showed no statistically significant difference from that of the radiologists. Moreover, in the diagnosis of abdominal LN metastasis, no significant differences were observed between any of the LLMs and the radiologists (<italic>P</italic>&#62;.05).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Accuracy of large language models (LLMs) and physicians for each category. Error bars represent 95% CIs for each accuracy. OpenAI-o1 and GPT-4 turbo were excluded from the figure for clarity, as their performance was consistently lower than the other models, as detailed in <xref ref-type="table" rid="table2">Tables 2</xref>-<xref ref-type="table" rid="table5">5</xref>. cN: clinical N; LN: lymph node.</p>
          </caption>
          <graphic xlink:href="cancer_v12i1e86630_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Interrater Reliability</title>
        <p>In the analysis of diagnostic correlation, GPT-5 and GPT-4.1 both demonstrated statistically significant weak correlations with the diagnoses of thoracic LN metastasis, abdominal LN metastasis, and cN stage. GPT-4.5 showed a statistically significant weak correlation for the diagnoses of thoracic and abdominal LN metastasis. GPT-4 Turbo showed a statistically significant weak correlation in the diagnosis of the stage. The other models did not demonstrate statistically significant consistency. In contrast, all physicians demonstrated a statistically significant moderate consistency for all items (<xref ref-type="table" rid="table2">Tables 2</xref>-<xref ref-type="table" rid="table5">5</xref>).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Summary of Results</title>
        <p>To our knowledge, this is the first study to evaluate the newly released GPT-5 for staging esophageal cancer using <sup>18</sup>F FDG-PET images. Our results demonstrate a statistically significant performance gap between physicians and current LLMs. While diagnostic accuracy varied across individual models and physicians, the average physician performance was significantly superior to that of the LLMs in assessing thoracic LN metastasis (91/120, 76%; 95% CI 72%-80% vs 68/120, 57%; 95% CI 52%-61%; <italic>P</italic>&#60;.001), abdominal LN metastasis (102/120, 85%; 95% CI 78%-91% vs 82/120, 68%; 95% CI 62%-73%; <italic>P</italic>=.002), and cN stage (71/120, 60%; 95% CI 55%-64% vs 49/120, 41%; 95% CI 36%-45%; <italic>P</italic>&#60;.001; <xref ref-type="table" rid="table6">Table 6</xref>). Furthermore, LLM interpretations showed poor consistency (Cohen κ: –0.07 to 0.25), contrasting with the fair-to-substantial agreement observed among physicians (κ: 0.27 to 0.74). These statistical findings confirm that, despite some overlapping accuracy ranges, current general-purpose LLMs reliably underperform compared with human experts in complex staging tasks.</p>
      </sec>
      <sec>
        <title>Principal Findings</title>
        <p>Among the evaluated LLMs, GPT-5 demonstrated the highest diagnostic performance. It achieved accuracies of 76/120 (63%; 95% CI 54%-71%) for thoracic LN and 87/120 (73%; 95% CI 64%-80%) for abdominal LN assessment, numerically outperforming other models like GPT-4.5 (72/120, 60%, 95% CI 51%-68% and 83/120, 69%, 95% CI 61%-77%) and GPT-4.1 (70/120, 58%, 95% CI 49%-66% and 85/120, 71%, 95% CI 62%-78%; <xref ref-type="table" rid="table7">Table 7</xref>, GPTs and radiologists). This superior performance underscores the rapid advancement of these models, likely attributable to architectural enhancements and more comprehensive multimodal training [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
        <p>However, a critical analysis of these results reveals a fundamental limitation in using general-purpose generative AI for specialized clinical tasks. While GPT-5 had high specificity (115/120, 96%; 95% CI 90%-99% for abdominal LNs), its sensitivity was critically low (5/35, 14%; 95% CI 6%-29%) when compared with radiologists (20/35, 57%; 95% CI 40%-74%). This indicates that while the model is effective at identifying “normal” findings (True Negatives), it fails to reliably detect pathology (False Negatives). The discrepancy between the high accuracy and relatively low MCC scores observed in the LLMs further confirms that their performance was driven primarily by the correct identification of negative cases (specificity), rather than a balanced detection capability required for clinical staging. This performance profile suggests that general-purpose VLMs, which are primarily trained on natural images and text, currently lack the domain-specific visual calibration required to distinguish subtle metastatic uptake from physiological noise in medical imaging.</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Comparison of the accuracy.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="240"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="200"/>
            <col width="0"/>
            <col width="180"/>
            <col width="0"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Thoracic LN<sup>a</sup> (%) (95% CI)</td>
                <td colspan="2">Abdominal LN (%) (95% CI)</td>
                <td colspan="2">cN-stage<sup>b</sup> (%) (95% CI)</td>
                <td>cM-stage<sup>c</sup> (%) (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>Physicians<sup>d</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Radiologist (%)</td>
                <td colspan="2">78 (71-86)</td>
                <td colspan="2">80 (73-87)</td>
                <td colspan="2">58 (49-67)</td>
                <td colspan="2">78 (70-85)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Surgeon (%)</td>
                <td colspan="2">74 (66-82)</td>
                <td colspan="2">82 (75-89)</td>
                <td colspan="2">61 (52-70)</td>
                <td colspan="2">85 (79-92)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Radiology resident 1 (%)</td>
                <td colspan="2">74 (66-82)</td>
                <td colspan="2">88 (82-94)</td>
                <td colspan="2">61 (52-70)</td>
                <td colspan="2">77 (69-84)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Radiology resident 2 (%)</td>
                <td colspan="2">80 (72-96)</td>
                <td colspan="2">90 (83-94)</td>
                <td colspan="2">61 (52-69)</td>
                <td colspan="2">83 (76-89)</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>GPTs and radiologists</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-5 (%)</td>
                <td colspan="2">63 (54-71)</td>
                <td colspan="2">73 (64-80)</td>
                <td colspan="2">48 (40-57)</td>
                <td colspan="2">77 (68-83)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4.5 (%)</td>
                <td colspan="2">60 (51-68)</td>
                <td colspan="2">69 (61-77)</td>
                <td colspan="2">43 (34-52)</td>
                <td colspan="2">76 (68-84)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4.1 (%)</td>
                <td colspan="2">58 (49-66)</td>
                <td colspan="2">71 (62-78)</td>
                <td colspan="2">45 (36-54)</td>
                <td colspan="2">77 (68-83)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>OpenAI-o3 (%)</td>
                <td colspan="2">56 (47-64)</td>
                <td colspan="2">71 (62-78)</td>
                <td colspan="2">39 (31-48)</td>
                <td colspan="2">77 (68-83)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>OpenAI-o1 (%)</td>
                <td colspan="2">50 (40-59)</td>
                <td colspan="2">56 (47-65)</td>
                <td colspan="2">34 (26-43)</td>
                <td colspan="2">78 (70-85)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-turbo (%)</td>
                <td colspan="2">52 (43-61)</td>
                <td colspan="2">66 (57-74)</td>
                <td colspan="2">34 (26-43)</td>
                <td colspan="2">78 (70-86)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>LN: lymph node.</p>
            </fn>
            <fn id="table7fn2">
              <p><sup>b</sup>cN: clinical N.</p>
            </fn>
            <fn id="table7fn3">
              <p><sup>c</sup>cM: clinical M.</p>
            </fn>
            <fn id="table7fn4">
              <p><sup>d</sup>Thoracic LN: Cochran Q=4 (<italic>P</italic>=.26); Abdominal LN: Cochran Q=12.3 (<italic>P</italic>=.006); cN stage: Cochran Q=0.55 (<italic>P</italic>=.91); cM stage: Cochran Q=8.4 (<italic>P</italic>=.039).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparison to Prior Work</title>
        <p>Radiological image diagnosis using generative AI has been explored across various modalities, including plain radiography, CT, and ultrasound, with reported accuracies varying widely from 27.8% to 88% [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Many studies conclude that generative AI performance remains suboptimal for clinical use. Hong et al [<xref ref-type="bibr" rid="ref33">33</xref>] found that no model achieved clinical-grade applicability for reading chest radiographs due to significant false positives, false negatives, and hallucinations. Our findings align with this body of literature, confirming that substantial advancements are needed before generative AI can be practically applied in this clinical setting (Table S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]). Unlike earlier studies focused on simple classification or structuring textual data, our study targeted a core radiologist workflow: generating diagnostic reports directly from medical images. While LLMs excel at summarizing text and extracting information from existing reports [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>], few studies have explored their ability to derive TNM classifications from images, a task requiring both image interpretation and clinical reasoning. Previous research has often focused on T-factor classification, such as identifying a mass or its size. Our work extends this by comprehensively investigating N and M stage classification of malignant tumors using PET images. 
Evaluating LLM performance on complex clinical tasks, rather than simple diagnosis, is crucial for assessing their future clinical potential.</p>
        <p>Furthermore, although the models were not provided with explicit segmentation masks, we hypothesized that current VLMs would use their multimodal capabilities [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref29">29</xref>] to map the semantic text label, such as “middle thoracic,” to the corresponding high-uptake region on the MIP image. We hypothesized that the models would interpret this text input as a spatial guide by recognizing anatomical landmarks such as the proximity to the heart, thereby allowing them to distinguish and exclude the primary tumor from other metastatic lesions. This hypothesis was supported by our post hoc qualitative subanalysis, in which the model’s generated reasoning suggested that it identified the primary tumor location based on the provided text prompt.</p>
      </sec>
      <sec>
        <title>Impact on Clinical Management</title>
        <p>The performance gap between physicians and LLMs has significant implications for clinical decision-making. Accurate staging, particularly the detection of nodal and distant metastases, is critical for determining whether patients are candidates for curative surgery versus multimodal therapy. The low sensitivity of LLMs observed in this study (eg, 14% for abdominal LNs by GPT-5) poses a substantial risk of under-staging. In a clinical setting, relying on such a system could lead to the omission of necessary neoadjuvant chemotherapy or the performance of futile surgeries on patients with undetected metastases. In contrast, physicians demonstrated significantly higher sensitivity and balanced accuracy, ensuring that high-risk patients are appropriately identified for systemic treatment. Therefore, while LLMs show promise in specificity, their current lack of sensitivity precludes their utility as a standalone diagnostic tool for treatment planning.</p>
        <p>A key factor limiting LLM performance in medical imaging is the fundamental mismatch between their text-centric design and the demands of visual analysis. As LLMs are primarily trained on textual data, they excel in natural language understanding and reasoning but lack the capability to process and analyze complex visual information [<xref ref-type="bibr" rid="ref32">32</xref>]. This limitation is reflected in the observation that text-based report structuring consistently outperformed direct image-based diagnosis in radiology report generation. To improve accuracy, future research should prioritize architectures that better integrate text and visual data. Incorporating multimodal learning frameworks that combine textual and imaging information might enhance diagnostic performance and facilitate clinical applicability [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations that should be acknowledged.</p>
        <p>First, our dataset was limited by a significant class imbalance, particularly in M-stage classification, where only 27/120 (22.5%) of cases were M1-positive. Consequently, the resulting CIs for sensitivity were wide, and the study may be underpowered to detect significant differences in sensitivity for distant metastases. Such imbalances are known to bias machine learning models toward the majority class, potentially leading to overestimated specificity and underestimated sensitivity. Furthermore, potential image resolution degradation during the conversion and trimming of DICOM files may have impacted the diagnostic accuracy of the LLMs. A more balanced and carefully processed dataset would enable a more robust evaluation of model performance. Additionally, because the LLMs were prompted to provide binary classification outputs (yes/no) in a zero-shot setting rather than continuous probability scores, receiver operating characteristic curve analysis and area under the curve calculation were not feasible in this study.</p>
        <p>Second, the reliance on a single MIP image for each case does not reflect standard clinical practice. MIPs are 2D condensations that can omit crucial spatial and anatomical details necessary for accurate TNM staging, which clinicians typically determine by reviewing multiplanar image slices and integrating information from CT scans. This methodological constraint may have disadvantaged the LLMs when compared with human interpretation. Furthermore, our input was strictly limited to visual information from MIP images and did not include semiquantitative metabolic parameters (eg, SUVmax) or volumetric indices (eg, metabolic tumor volume), which are integral to standard PET/CT interpretation for differentiating malignant from physiological uptake. This is further supported by the variability in diagnostic accuracy observed among physicians within our study, which suggests a potential discrepancy between radiological assessment in this experimental setting and actual clinical workflows.</p>
        <p>Third, the diagnostic criteria were not explicitly defined for either the LLMs or the human evaluators. While a subanalysis requesting the “basis for diagnosis” was performed to check for logical consistency (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), we acknowledge that this generated text itself represents a potential hallucination and is not a substitute for visual attention maps. The subanalysis revealed that while the model often generates text regarding clinical exclusion criteria (eg, ignoring hilar nodes), it remains prone to hallucinations (Case C) and stochastic instability (as with Case B). The addition of a reasoning prompt paradoxically led to a false-negative result in a previously correctly diagnosed case. Crucially, due to the “black box” nature of commercial APIs, we could not generate saliency maps or obtain reliable bounding box coordinates to verify the models' focus. Consequently, we cannot determine whether the “correct” classifications were achieved based on appropriate anatomical features or simply represent “right answers for the wrong reasons.”</p>
        <p>Fourth, the use of a private, single-institution dataset limits the generalizability of our findings. Differences in imaging protocols, patient populations, and clinical workflows across institutions can significantly affect model performance, making external validation with larger, multicenter datasets essential. However, it is noteworthy that the LLMs were applied in their general-purpose form without task-specific fine-tuning. While differing from traditional machine learning models that are often customized, this approach facilitates performance testing across diverse environments, suggesting that LLMs may be suitable for broader clinical and research applications where consistency and ease of validation are important.</p>
        <p>Finally, this was a single-institution study focusing exclusively on esophageal SCC. Differences in imaging protocols, patient demographics, and disease pathologies across institutions were not accounted for, limiting the external validity of our findings.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Future research should prioritize several key areas to improve diagnostic accuracy and clinical utility. First, overcoming the limitations of 2D MIP images is essential; integrating volumetric data from CT and 3D PET scans is necessary to capture the spatial and anatomical details required for accurate staging. Second, future studies should use multimodal learning frameworks that better synthesize textual clinical data with imaging features, rather than relying only on text-centric architectures. Third, to address the “black box” nature of current models, assessments should include outputs such as bounding boxes or heatmaps to verify that the model is identifying the correct pathology rather than hallucinating lesions.</p>
        <p>Finally, conducting external validation using larger, multi-institutional datasets is crucial to assess generalizability across different imaging protocols and diverse patient populations.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Current general-purpose LLMs, including GPT-5, do not achieve physician-level diagnostic accuracy for esophageal cancer staging based on MIP images. While newer models demonstrate improved specificity and a reduction in hallucinations when compared with those of earlier iterations, their sensitivity for detecting nodal and distant metastases remains insufficient for clinical use. These findings suggest that while LLMs hold potential as future support tools, they currently cannot replace or reliably augment expert radiological assessment in this domain. Future development must prioritize the integration of volumetric data and multimodal capabilities to bridge the notable performance gap observed in this study.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Qualitative Assessment of Diagnostic Reasoning.</p>
        <media xlink:href="cancer_v12i1e86630_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 10 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Comparison of recent studies evaluating multimodal large language models in radiological imaging.</p>
        <media xlink:href="cancer_v12i1e86630_app2.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 13 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">cM</term>
          <def>
            <p>clinical M</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">cN</term>
          <def>
            <p>clinical N</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CT</term>
          <def>
            <p>computed tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FDG-PET</term>
          <def>
            <p>fluorodeoxyglucose positron emission tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LN</term>
          <def>
            <p>lymph node</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MIP</term>
          <def>
            <p>maximum intensity projection</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MCC</term>
          <def>
            <p>Matthews correlation coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">PET</term>
          <def>
            <p>positron emission tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SCC</term>
          <def>
            <p>squamous cell carcinoma</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The author used Google Gemini 3 Pro for spell checking and formatting the paper.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data that support the findings of this study are available from the corresponding author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>HM, YT, and KT designed and conceived this study. HM, YT, YA, MI, and YN collected data. HM, YT, YA, and KT analyzed and interpreted the results and drafted the manuscript. All authors read and approved the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bray</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Laversanne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ferlay</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Soerjomataram</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Jemal</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Global cancer statistics 2022: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>
          <source>CA Cancer J Clin</source>
          <year>2024</year>
          <volume>74</volume>
          <issue>3</issue>
          <fpage>229</fpage>
          <lpage>263</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.3322/caac.21834"/>
          </comment>
          <pub-id pub-id-type="doi">10.3322/caac.21834</pub-id>
          <pub-id pub-id-type="medline">38572751</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Ibraheem</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Abdlkadir</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Herrmann</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bomanji</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jadvar</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mansour</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Paez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chiti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy of [18F]FDG PET/MRI in head and neck squamous cell carcinoma: a systematic review and meta-analysis</article-title>
          <source>J Nucl Med</source>
          <year>2024</year>
          <volume>65</volume>
          <issue>10</issue>
          <fpage>1533</fpage>
          <lpage>1539</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://jnm.snmjournals.org/cgi/pmidlookup?view=long&#38;pmid=39266291"/>
          </comment>
          <pub-id pub-id-type="doi">10.2967/jnumed.124.268049</pub-id>
          <pub-id pub-id-type="medline">39266291</pub-id>
          <pub-id pub-id-type="pii">jnumed.124.268049</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>GlobalData Plc</collab>
          </person-group>
          <source>The Complexities of Physician Supply and Demand: Projections from 2021 to 2036</source>
          <year>2021</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>AAMC</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takami</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kodera</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Eguchi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kitago</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Murotani</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hirano</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kitagawa</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ikeda</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mori</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The shortage of surgeons in Japan: results of an online survey of qualified teaching hospitals that take part in the surgical training programs for board certification by the Japan Surgical Society</article-title>
          <source>Surg Today</source>
          <year>2024</year>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <lpage>52</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37193795"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00595-023-02697-7</pub-id>
          <pub-id pub-id-type="medline">37193795</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00595-023-02697-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10764368</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>The Royal College of Radiologists</collab>
          </person-group>
          <article-title>Clinical radiology UK workforce census report 2023</article-title>
          <source>Royal College of Radiologists</source>
          <year>2024</year>
          <access-date>2025-08-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.rcr.ac.uk/media/4imb5jge/_rcr-2024-clinical-radiology-workforce-census-report.pdf">https://www.rcr.ac.uk/media/4imb5jge/_rcr-2024-clinical-radiology-workforce-census-report.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Afshari Mirak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tirumani</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Ramaiya</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>The growing nationwide radiologist shortage: current opportunities and ongoing challenges for international medical graduate radiologists</article-title>
          <source>Radiology</source>
          <year>2025</year>
          <volume>314</volume>
          <issue>3</issue>
          <fpage>e232625</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.232625</pub-id>
          <pub-id pub-id-type="medline">40035678</pub-id>
          <pub-id pub-id-type="pmcid">PMC11950870</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith-Bindman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kwan</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Marlow</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Theis</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Bolch</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Bowles</surname>
              <given-names>EJA</given-names>
            </name>
            <name name-style="western">
              <surname>Duncan</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Greenlee</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Kushi</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Pole</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Rahm</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Stout</surname>
              <given-names>NK</given-names>
            </name>
            <name name-style="western">
              <surname>Weinmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Miglioretti</surname>
              <given-names>DL</given-names>
            </name>
          </person-group>
          <article-title>Trends in use of medical imaging in US health care systems and in Ontario, Canada, 2000-2016</article-title>
          <source>JAMA</source>
          <year>2019</year>
          <volume>322</volume>
          <issue>9</issue>
          <fpage>843</fpage>
          <lpage>856</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31479136"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2019.11456</pub-id>
          <pub-id pub-id-type="medline">31479136</pub-id>
          <pub-id pub-id-type="pii">2749213</pub-id>
          <pub-id pub-id-type="pmcid">PMC6724186</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Frood</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Willaime</surname>
              <given-names>JMY</given-names>
            </name>
            <name name-style="western">
              <surname>Miles</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chambers</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Chalabi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hougham</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Brooks</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Petrides</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Naylor</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sulkin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chaytor</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Strouhal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Scarsbrook</surname>
              <given-names>AF</given-names>
            </name>
          </person-group>
          <article-title>Comparative effectiveness of standard vs. AI-assisted PET/CT reading workflow for pre-treatment lymphoma staging: a multi-institutional reader study evaluation</article-title>
          <source>Front Nucl Med</source>
          <year>2023</year>
          <volume>3</volume>
          <fpage>1327186</fpage>
          <pub-id pub-id-type="doi">10.3389/fnume.2023.1327186</pub-id>
          <pub-id pub-id-type="medline">39355039</pub-id>
          <pub-id pub-id-type="pmcid">PMC11440880</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agress</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>TZ</given-names>
            </name>
            <name name-style="western">
              <surname>Shreve</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Interpretation and reporting of positron emission tomography-computed tomographic scans</article-title>
          <source>Semin Ultrasound CT MR</source>
          <year>2008</year>
          <volume>29</volume>
          <issue>4</issue>
          <fpage>283</fpage>
          <lpage>290</lpage>
          <pub-id pub-id-type="doi">10.1053/j.sult.2008.05.001</pub-id>
          <pub-id pub-id-type="medline">18795496</pub-id>
          <pub-id pub-id-type="pii">S0887-2171(08)00040-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Expert Panels on Thoracic and Gastrointestinal Imaging</collab>
            <name name-style="western">
              <surname>Raptis</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Porter</surname>
              <given-names>KK</given-names>
            </name>
            <name name-style="western">
              <surname>Catenacci</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Kuzniewski</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Long</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Sandler</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Sirajuddin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Surasi</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>GW</given-names>
            </name>
            <name name-style="western">
              <surname>Kamel</surname>
              <given-names>IR</given-names>
            </name>
            <name name-style="western">
              <surname>Donnelly</surname>
              <given-names>EF</given-names>
            </name>
          </person-group>
          <article-title>ACR appropriateness criteria® staging and follow-up of esophageal cancer</article-title>
          <source>J Am Coll Radiol</source>
          <year>2022</year>
          <volume>19</volume>
          <issue>11S</issue>
          <fpage>S462</fpage>
          <lpage>S472</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://escholarship.org/uc/item/qt3tz2c140"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jacr.2022.09.008</pub-id>
          <pub-id pub-id-type="medline">36436970</pub-id>
          <pub-id pub-id-type="pii">S1546-1440(22)00644-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Obermannová</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alsina</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cervantes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Leong</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lordick</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Nilsson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van Grieken</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Vogel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Smyth</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Oesophageal cancer: ESMO clinical practice guideline for diagnosis, treatment and follow-up</article-title>
          <source>Ann Oncol</source>
          <year>2022</year>
          <volume>33</volume>
          <issue>10</issue>
          <fpage>992</fpage>
          <lpage>1004</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0923-7534(22)01850-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.annonc.2022.07.003</pub-id>
          <pub-id pub-id-type="medline">35914638</pub-id>
          <pub-id pub-id-type="pii">S0923-7534(22)01850-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Ibraheem</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Abdlkadir</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Shagera</surname>
              <given-names>QA</given-names>
            </name>
            <name name-style="western">
              <surname>Saraireh</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Adhami</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Rashdan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Anwar</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Moghrabi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamad</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Muylle</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Estrada</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Paez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mansour</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lopci</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>The diagnostic and predictive value of 18F-fluorodeoxyglucose positron emission tomography/computed tomography in laryngeal squamous cell carcinoma</article-title>
          <source>Cancers (Basel)</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>22</issue>
          <fpage>5461</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=cancers15225461"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/cancers15225461</pub-id>
          <pub-id pub-id-type="medline">38001720</pub-id>
          <pub-id pub-id-type="pii">cancers15225461</pub-id>
          <pub-id pub-id-type="pmcid">PMC10670341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Geus-Oei</surname>
              <given-names>LF</given-names>
            </name>
            <name name-style="western">
              <surname>Oyen</surname>
              <given-names>WJ</given-names>
            </name>
          </person-group>
          <article-title>Predictive and prognostic value of FDG-PET</article-title>
          <source>Cancer Imaging</source>
          <year>2008</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>70</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/18390390"/>
          </comment>
          <pub-id pub-id-type="doi">10.1102/1470-7330.2008.0010</pub-id>
          <pub-id pub-id-type="medline">18390390</pub-id>
          <pub-id pub-id-type="pmcid">PMC2324370</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maruyama</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Toyama</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Takanami</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Takase</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kamei</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Role of artificial intelligence in surgical training by assessing GPT-4 and GPT-4o on the Japan surgical board examination with text-only and image-accompanied questions: performance evaluation study</article-title>
          <source>JMIR Med Educ</source>
          <year>2025</year>
          <volume>11</volume>
          <fpage>e69313</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2025//e69313/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/69313</pub-id>
          <pub-id pub-id-type="medline">40737609</pub-id>
          <pub-id pub-id-type="pii">v11i1e69313</pub-id>
          <pub-id pub-id-type="pmcid">PMC12310146</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dehdab</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Brendlin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Werner</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Almansour</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gassenmaier</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brendel</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Nikolaou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Afat</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Evaluating GPT-4V in chest CT diagnostics: a critical image interpretation assessment</article-title>
          <source>Jpn J Radiol</source>
          <year>2024</year>
          <volume>42</volume>
          <issue>10</issue>
          <fpage>1168</fpage>
          <lpage>1177</lpage>
          <pub-id pub-id-type="doi">10.1007/s11604-024-01606-3</pub-id>
          <pub-id pub-id-type="medline">38867035</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11604-024-01606-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC11442562</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chambara</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>SYW</given-names>
            </name>
            <name name-style="western">
              <surname>Gunda</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>MTC</given-names>
            </name>
          </person-group>
          <article-title>Assessing the feasibility of GPT-4o and Claude 3-Opus in thyroid nodule classification based on ultrasound images</article-title>
          <source>Endocrine</source>
          <year>2024</year>
          <volume>87</volume>
          <issue>3</issue>
          <fpage>1041</fpage>
          <lpage>1049</lpage>
          <pub-id pub-id-type="doi">10.1007/s12020-024-04066-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chetla</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tandon</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sukhija</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Evaluating GPT's efficacy in pediatric pneumonia detection from chest X-rays: comparative analysis of specialized AI models</article-title>
          <source>JMIR AI</source>
          <year>2025</year>
          <volume>4</volume>
          <fpage>e67621</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ai.jmir.org/2025//e67621/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/67621</pub-id>
          <pub-id pub-id-type="medline">39793007</pub-id>
          <pub-id pub-id-type="pii">v4i1e67621</pub-id>
          <pub-id pub-id-type="pmcid">PMC11759907</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Kwon</surname>
              <given-names>YE</given-names>
            </name>
          </person-group>
          <article-title>Validation of a deep learning chest X-ray interpretation model: integrating large-scale AI and large language models for comparative analysis with GPT</article-title>
          <source>Diagnostics (Basel)</source>
          <year>2023</year>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>90</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=diagnostics14010090"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/diagnostics14010090</pub-id>
          <pub-id pub-id-type="medline">38201398</pub-id>
          <pub-id pub-id-type="pii">diagnostics14010090</pub-id>
          <pub-id pub-id-type="pmcid">PMC10795741</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mongan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Moy</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kahn</surname>
              <given-names>CE</given-names>
            </name>
          </person-group>
          <article-title>Checklist for artificial intelligence in medical imaging (CLAIM): a guide for authors and reviewers</article-title>
          <source>Radiol Artif Intell</source>
          <year>2020</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e200029</fpage>
          <pub-id pub-id-type="doi">10.1148/ryai.2020200029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brierley</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Gospodarowicz</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Wittekind</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <source>TNM Classification of Malignant Tumours</source>
          <year>2016</year>
          <publisher-loc>Hoboken</publisher-loc>
          <publisher-name>Wiley-Blackwell</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Girum</surname>
              <given-names>KB</given-names>
            </name>
            <name name-style="western">
              <surname>Rebaud</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cottereau</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Meignan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clerc</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vercellino</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Casasnovas</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Morschhauser</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Thieblemont</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Buvat</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>18F-FDG PET maximum-intensity projections and artificial intelligence: a win-win combination to easily measure prognostic biomarkers in DLBCL patients</article-title>
          <source>J Nucl Med</source>
          <year>2022</year>
          <volume>63</volume>
          <issue>12</issue>
          <fpage>1925</fpage>
          <lpage>1932</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://jnm.snmjournals.org/cgi/pmidlookup?view=long&#38;pmid=35710733"/>
          </comment>
          <pub-id pub-id-type="doi">10.2967/jnumed.121.263501</pub-id>
          <pub-id pub-id-type="medline">35710733</pub-id>
          <pub-id pub-id-type="pii">jnumed.121.263501</pub-id>
          <pub-id pub-id-type="pmcid">PMC9730929</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mine</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tanaka</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kawachi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shirakawa</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kitagawa</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Toh</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yasuda</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Watanabe</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kamei</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Oyama</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Seto</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Murakami</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Arai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Muto</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Doki</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Japanese classification of esophageal cancer</article-title>
          <source>Esophagus</source>
          <year>2024</year>
          <volume>21</volume>
          <issue>3</issue>
          <fpage>179</fpage>
          <lpage>215</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38568243"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10388-024-01054-y</pub-id>
          <pub-id pub-id-type="medline">38568243</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10388-024-01054-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC11199297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <article-title>Usage policies</article-title>
          <source>OpenAI</source>
          <access-date>2025-05-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/policies/usage-policies">https://openai.com/policies/usage-policies</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dogra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Silva</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>The financial, operational, and clinical advantages of generalist radiology AI</article-title>
          <source>Radiology</source>
          <year>2025</year>
          <volume>316</volume>
          <issue>3</issue>
          <fpage>e242362</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.242362</pub-id>
          <pub-id pub-id-type="medline">40923883</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <article-title>Evaluation of multimodal generative AI for esophageal cancer staging using FDG-PET: diagnostic accuracy and comparison with physicians</article-title>
          <source>GitHub</source>
          <access-date>2025-10-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/YuyaAraki/chatgpt-nuclear-imaging">https://github.com/YuyaAraki/chatgpt-nuclear-imaging</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Mapping of lymph node metastasis from thoracic esophageal cancer: a retrospective study</article-title>
          <source>Ann Surg Oncol</source>
          <year>2022</year>
          <volume>29</volume>
          <issue>9</issue>
          <fpage>5681</fpage>
          <lpage>5688</lpage>
          <pub-id pub-id-type="doi">10.1245/s10434-022-11867-9</pub-id>
          <pub-id pub-id-type="medline">35543907</pub-id>
          <pub-id pub-id-type="pii">10.1245/s10434-022-11867-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chicco</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jurman</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>The advantages of the Matthews correlation coefficient (MCC) over F1 score and accuracy in binary classification evaluation</article-title>
          <source>BMC Genomics</source>
          <year>2020</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>6</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12864-019-6413-7</pub-id>
          <pub-id pub-id-type="medline">31898477</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12864-019-6413-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6941312</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Landis</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Koch</surname>
              <given-names>GG</given-names>
            </name>
          </person-group>
          <article-title>The measurement of observer agreement for categorical data</article-title>
          <source>Biometrics</source>
          <year>1977</year>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>159</fpage>
          <lpage>174</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.2307/2529310"/>
          </comment>
          <pub-id pub-id-type="doi">10.2307/2529310</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McNemar</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Note on the sampling error of the difference between correlated proportions or percentages</article-title>
          <source>Psychometrika</source>
          <year>1947</year>
          <volume>12</volume>
          <issue>2</issue>
          <fpage>153</fpage>
          <lpage>157</lpage>
          <pub-id pub-id-type="doi">10.1007/bf02295996</pub-id>
          <pub-id pub-id-type="medline">20254758</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <source>GPT-5 is here</source>
          <access-date>2025-08-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/gpt-5">https://openai.com/gpt-5</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Yun</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>HS</given-names>
            </name>
          </person-group>
          <article-title>Comparative analysis of M4CXR, an LLM-based chest X-ray report generation model, and GPT in radiological interpretation</article-title>
          <source>J Clin Med</source>
          <year>2024</year>
          <volume>13</volume>
          <issue>23</issue>
          <fpage>7057</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=jcm13237057"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/jcm13237057</pub-id>
          <pub-id pub-id-type="medline">39685515</pub-id>
          <pub-id pub-id-type="pii">jcm13237057</pub-id>
          <pub-id pub-id-type="pmcid">PMC11642207</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reith</surname>
              <given-names>TP</given-names>
            </name>
            <name name-style="western">
              <surname>D’Alessandro</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>D’Alessandro</surname>
              <given-names>MP</given-names>
            </name>
          </person-group>
          <article-title>Capability of multimodal large language models to interpret pediatric radiological images</article-title>
          <source>Pediatr Radiol</source>
          <year>2024</year>
          <volume>54</volume>
          <issue>10</issue>
          <fpage>1729</fpage>
          <lpage>1737</lpage>
          <pub-id pub-id-type="doi">10.1007/s00247-024-06025-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>EK</given-names>
            </name>
            <name name-style="western">
              <surname>Ham</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Roh</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Eom</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bae</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>OK</given-names>
            </name>
            <name name-style="western">
              <surname>Bae</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Suh</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>TH</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy and clinical value of a domain-specific multimodal generative AI model for chest radiograph report generation</article-title>
          <source>Radiology</source>
          <year>2025</year>
          <volume>314</volume>
          <issue>3</issue>
          <fpage>e241476</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.241476</pub-id>
          <pub-id pub-id-type="medline">40131111</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>The potential of Gemini and GPTs for structured report generation based on free-text 18F-FDG PET/CT breast cancer reports</article-title>
          <source>Acad Radiol</source>
          <year>2025</year>
          <volume>32</volume>
          <issue>2</issue>
          <fpage>624</fpage>
          <lpage>633</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1076-6332(24)00615-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.acra.2024.08.052</pub-id>
          <pub-id pub-id-type="medline">39245597</pub-id>
          <pub-id pub-id-type="pii">S1076-6332(24)00615-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salam</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kravchenko</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nowak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sprinkart</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Weinhold</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Odenthal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mesropyan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Bischoff</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Attenberger</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Kuetting</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Luetkens</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Isaak</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Generative pre-trained transformer 4 makes cardiovascular magnetic resonance reports easy to understand</article-title>
          <source>J Cardiovasc Magn Reson</source>
          <year>2024</year>
          <volume>26</volume>
          <issue>1</issue>
          <fpage>101035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1097-6647(24)01026-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jocmr.2024.101035</pub-id>
          <pub-id pub-id-type="medline">38460841</pub-id>
          <pub-id pub-id-type="pii">S1097-6647(24)01026-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC10981113</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Figueredo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>WE</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A survey of deep-learning-based radiology report generation using multimodal inputs</article-title>
          <source>Med Image Anal</source>
          <year>2025</year>
          <volume>103</volume>
          <fpage>103627</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1361-8415(25)00174-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.media.2025.103627</pub-id>
          <pub-id pub-id-type="medline">40382855</pub-id>
          <pub-id pub-id-type="pii">S1361-8415(25)00174-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clusmann</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kolbinger</surname>
              <given-names>FR</given-names>
            </name>
            <name name-style="western">
              <surname>Muti</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Carrero</surname>
              <given-names>ZI</given-names>
            </name>
            <name name-style="western">
              <surname>Eckardt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Laleh</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Löffler</surname>
              <given-names>CML</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarzkopf</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Unger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Veldhuizen</surname>
              <given-names>GP</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kather</surname>
              <given-names>JN</given-names>
            </name>
          </person-group>
          <article-title>The future landscape of large language models in medicine</article-title>
          <source>Commun Med (Lond)</source>
          <year>2023</year>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>141</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s43856-023-00370-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id>
          <pub-id pub-id-type="medline">37816837</pub-id>
          <pub-id pub-id-type="pii">10.1038/s43856-023-00370-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC10564921</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
