<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Cancer</journal-id><journal-id journal-id-type="publisher-id">cancer</journal-id><journal-id journal-id-type="index">21</journal-id><journal-title>JMIR Cancer</journal-title><abbrev-journal-title>JMIR Cancer</abbrev-journal-title><issn pub-type="epub">2369-1999</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e78221</article-id><article-id pub-id-type="doi">10.2196/78221</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Models for Supporting Clear Writing and Detecting Spin in Randomized Controlled Trials in Oncology: Comparative Analysis of GPT Models and Prompts</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Koechli</surname><given-names>Carole</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dennst&#x00E4;dt</surname><given-names>Fabio</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schr&#x00F6;der</surname><given-names>Christina</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Aebersold</surname><given-names>Daniel M</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>F&#x00F6;rster</surname><given-names>Robert</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zwahlen</surname><given-names>Daniel R</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Windisch</surname><given-names>Paul</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Radiation Oncology, Kantonsspital Winterthur</institution><addr-line>Brauerstrasse 15</addr-line><addr-line>Winterthur</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>Department of Radiation Oncology, Inselspital, Bern University Hospital, University of Bern</institution><addr-line>Bern</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Cahill</surname><given-names>Naomi</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hu</surname><given-names>Danqing</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Williams-Nguyen</surname><given-names>Jessica</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Matsuda</surname><given-names>Shinichi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Paul Windisch, MD, Department of Radiation Oncology, Kantonsspital Winterthur, 
Brauerstrasse 15, Winterthur, Switzerland, 41 52 266 26 53; <email>paul.windisch@ksw.ch</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>21</day><month>1</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e78221</elocation-id><history><date date-type="received"><day>28</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>16</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Carole Koechli, Fabio Dennst&#x00E4;dt, Christina Schr&#x00F6;der, Daniel M Aebersold, Robert F&#x00F6;rster, Daniel R Zwahlen, Paul Windisch. Originally published in JMIR Cancer (<ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org">https://cancer.jmir.org</ext-link>), 21.1.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cancer, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org/">https://cancer.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://cancer.jmir.org/2026/1/e78221"/><abstract><sec><title>Background</title><p>Randomized controlled trials (RCTs) are the gold standard for evaluating interventions in oncology, but reporting can be subject to &#x201C;spin&#x201D;&#x2014;presenting results in ways that mislead readers about true efficacy.</p></sec><sec><title>Objective</title><p>This study aimed to investigate whether large language models (LLMs) could provide a standardized approach to detect spin, particularly in the conclusions, where it most commonly occurs.</p></sec><sec sec-type="methods"><title>Methods</title><p>We randomly sampled 250 two-arm, single&#x2013;primary end point oncology RCTs from 7 major medical journals published between 2005 and 2023. Two authors independently annotated trials as positive or negative based on whether they met their primary end point. Three commercial LLMs (GPT-3.5 Turbo, GPT-4o, and GPT-o1) were tasked with classifying trials as positive or negative when provided with (1) conclusions only; (2) methods and conclusions; (3) methods, results, and conclusions; or (4) title and full abstract. LLM performance was evaluated against human annotations. Afterward, trials incorrectly classified as positive when the model was provided only with the conclusions but correctly classified as negative when provided with the whole abstract were analyzed for patterns that may indicate the presence of spin. 
Model performance was assessed using accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score calculated from confusion matrices.</p></sec><sec sec-type="results"><title>Results</title><p>Of the 250 trials, 146 (58.4%) were positive, and 104 (41.6%) were negative. The GPT-o1 model demonstrated the highest performance across all conditions, with <italic>F</italic><sub>1</sub>-scores of 0.93 (conclusions only; 95% CI 0.90-0.96), 0.96 (methods and conclusions; 95% CI 0.93-0.98), 0.98 (methods, results, and conclusions; 95% CI 0.96-0.99), and 0.97 (title and abstract; 95% CI 0.95-0.99). Analysis of trials incorrectly classified as positive when the model was provided only with the conclusions revealed shared patterns, including absence of primary end point results, emphasis on subgroup improvements, or unclear distinction between primary and secondary end points. These patterns were almost never found in trials correctly classified as negative.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs can effectively detect potential spin in oncology RCT reporting by identifying discrepancies between how trials are presented in the conclusions vs the full abstracts. This approach could serve as a supplementary tool for improving transparency in scientific reporting, although further development is needed to address more complex trial designs beyond those examined in this feasibility study.</p></sec></abstract><kwd-group><kwd>spin</kwd><kwd>randomized controlled trials</kwd><kwd>large language models</kwd><kwd>data mining</kwd><kwd>natural language processing</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Randomized controlled trials (RCTs) represent the gold standard for evaluating interventions in oncology [<xref ref-type="bibr" rid="ref1">1</xref>]. 
However, the reporting and interpretation of trial results can be subject to inconsistency and &#x201C;spin&#x201D;&#x2014;the presentation of results in a way that may mislead readers about the true efficacy of interventions [<xref ref-type="bibr" rid="ref2">2</xref>]. This can, for example, be accomplished by emphasizing secondary end points or subgroup analyses when primary end points are not met. While most research that has looked at the topic has found a substantial prevalence of spin, the exact number varies as it is not always straightforward to differentiate between what constitutes a balanced and comprehensive presentation of the results and what may be an attempt to mislead the reader [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>The presence of spin has important implications. Clinicians, policymakers, and even patients often rely heavily on abstracts and conclusions when interpreting trial findings, as full-text analyses are time-consuming and not always accessible. Therefore, misrepresentation of results might contribute to overly optimistic perceptions of treatment benefits, potentially influencing clinical decision-making, guideline development, and even the allocation of research funding. Given the increasing complexity of cancer care and the rapidly expanding volume of clinical trials, ensuring clarity and accuracy in scientific reporting is crucial to avoid bias in evidence synthesis and translation into practice.</p><p>The growing capabilities of large language models (LLMs) could constitute a standardized way to determine the presence of spin. If an abstract is clearly written, a state-of-the-art LLM should be able to determine whether its primary end point was met. 
As multiple studies have identified the conclusions as the most frequent source of spin [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], we hypothesized that trials which are correctly classified as negative&#x2014;defined as trials that did not meet their primary end point&#x2014;by an LLM when provided with the title and abstract, but incorrectly classified as positive when provided with only the conclusions, would be likely to contain some form of spin. Therefore, the aim of this study was to evaluate whether LLMs can reliably classify oncology RCTs as positive or negative and whether discrepancies between conclusion-only and full-abstract classifications can help identify patterns consistent with spin.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>Randomized controlled oncology trials from 7 major medical journals (<italic>British Medical Journal</italic>, <italic>Journal of the American Medical Association</italic>, <italic>Journal of the American Medical Association Oncology</italic>, <italic>Journal of Clinical Oncology</italic>, <italic>The Lancet</italic>, <italic>The Lancet Oncology</italic>, and <italic>The New England Journal of Medicine</italic>) published between 2005 and 2023 were randomly sampled by downloading the available abstracts for the time frame via PubMed in a text file and parsing the abstracts using regular expressions. These 7 journals were selected because they publish a large and consistent volume of oncology RCTs and are widely regarded as leading general or oncology-specific medical journals. The 2005 to 2023 range was chosen to capture contemporary trial reporting practices while ensuring sufficient volume across all selected journals. 
To avoid edge cases for this feasibility study, it was decided to limit the eligible trials to designs with exactly 2 arms and 1 primary end point.</p><p>We aimed to sample 250 trials as this number ensured a sufficiently large dataset for the feasibility analysis while remaining feasible for manual dual annotation. Trials were sampled by creating a randomized list of all retrieved abstracts. Two authors (CK and PW) then started the annotation from the top of the random list and stopped after 250 two-arm, single&#x2013;primary end point oncology trials had been annotated. No journal-level quotas were applied.</p><p>The purpose of the annotation was to establish the ground-truth classification&#x2014;whether the trial met its primary end point&#x2014;against which model predictions could be evaluated. The annotation was conducted in a 2-step process. After annotating the first 20 trials, all samples were discussed to recognize potential differences in the annotation criteria. The remaining trials were annotated separately, and discrepancies were discussed after all trials had been annotated. A third author (DRZ) would have been responsible for judging disagreements that persisted after discussion. However, this was not necessary. The annotation was performed using the Prodigy tool (version 1.13.1; Explosion), which only showed the extracted abstract as text without any additional information such as authors or institutions. Only in cases in which the abstract did not clearly state the primary end point and its results did we refer to the full publication or protocol. Three commercially available LLMs, namely, GPT-3.5 Turbo, GPT-4o, and GPT-o1 (OpenAI), were then tasked with classifying the trials as positive or negative. 
The 3 models were chosen to investigate whether the inherent capabilities of the models would impact their suitability for the classification task (eg, simpler models requiring more explicit language to correctly identify trials) and, thus, their performance when trying to leverage differences in classification accuracy to detect unclear writing and spin. The decision to use OpenAI models was based on the prevalent use of these models at the time as well as the convenience of application programming interface access and lack of privacy concerns regarding the study data. The respective model snapshots were gpt-3.5-turbo-0125, gpt-4o-2024-11-20, and o1-2024-12-17. The LLMs were called via the application programming interface, with the temperature parameter set to 1. We refrained from performing multiple classification runs as a previous study from the same research group had shown very consistent performance by LLMs for both classification and named-entity recognition tasks, as long as the temperature was kept at or below 1.50 [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Each model was evaluated in 4 different rounds. In round 1, the models were only provided with the conclusions of the abstract. In round 2, the models were provided with the methods and conclusions of the abstract. In round 3, the models were provided with the methods, results, and conclusions of the abstract. In round 4, the models were provided with the title and the full abstract.</p><p>The following system prompt (ie, the fixed instruction provided to the model to define its task) was used: &#x201C;You will be provided with the {section} of a randomized controlled oncology trial. Your task will be to classify if the trial was positive, i.e. if it met its primary endpoint, or negative, i.e. if it did not meet its primary endpoint. 
Your response should be either the word POSITIVE (in all caps) or NEGATIVE (in all caps).&#x201D;</p><p>The &#x201C;{section}&#x201D; part was replaced with either &#x201C;conclusion,&#x201D; &#x201C;methods and conclusion,&#x201D; &#x201C;methods, results, and conclusion,&#x201D; or &#x201C;title and abstract.&#x201D; The user prompt (ie, the specific input text) was the corresponding title, abstract, or sections of the abstract.</p><p>The prompts were designed to be as explicit as possible regarding the definition of a positive trial to minimize ambiguity and ensure consistent model behavior across conditions. However, we did not conduct a systematic comparison of different prompts.</p></sec><sec id="s2-2"><title>Statistical Analysis</title><p>Interannotator agreement was calculated as the number of trials to which both annotators assigned the same label divided by the total number of annotated trials, expressed as a percentage.</p><p>The results were evaluated against the ground truth (ie, the human-annotated classification of whether the trial met its primary end point) by creating confusion matrices and computing several performance metrics to obtain a holistic picture of model performance. These included accuracy (the proportion of correctly classified trials among all trials), precision (the proportion of predicted positive trials that were truly positive; equivalent to positive predictive value), recall (the proportion of truly positive trials that were correctly predicted as positive; equivalent to sensitivity), and <italic>F</italic><sub>1</sub>-score (the harmonic mean of precision and recall). For completeness, specificity (true negative rate) and negative predictive value can also be derived from the confusion matrix but were not separately reported. The 95% CIs were estimated using normal approximation intervals. 
For the best-performing model, we further analyzed and categorized the trials that were incorrectly predicted as positive when the model was provided with only the conclusions but were correctly predicted as negative when it was provided with the title and abstract. For these trials, a single author (PW) reviewed the full conclusions and abstracts to categorize the patterns leading to incorrect classification (eg, omission of primary end point, emphasis on subgroup findings, or unclear distinction between end points). To contextualize these findings, we additionally randomly selected 10 trials that were correctly classified as negative by GPT-o1 and performed the same qualitative assessment. All programming was performed in Python (Python Software Foundation; version 3.13.2) using, among others, the <italic>pandas</italic> (version 2.2.3) and <italic>openai</italic> (version 1.67.0) packages.</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study used publicly available abstracts from published clinical trials. All data were deidentified and contained no patient-level information; therefore, ethics approval was not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Interannotator agreement was 97.2% (243/250). All of the disagreements were caused by simple mistakes and could be easily resolved during the discussion. Ultimately, 58.4% (146/250) of the trials were annotated as positive, and 41.6% (104/250) were annotated as negative.</p><p>The performances of the models when provided with different sections of the abstract are shown in <xref ref-type="fig" rid="figure1">Figure 1</xref> and <xref ref-type="table" rid="table1">Table 1</xref>. GPT-o1 exhibited the best performance in each round, with <italic>F</italic><sub>1</sub>-scores of 0.93 (conclusions only), 0.96 (methods and conclusions), 0.98 (methods, results, and conclusions), and 0.97 (title and abstract). 
GPT-4o&#x2019;s <italic>F</italic><sub>1</sub>-scores across the 4 rounds were 0.89, 0.91, 0.94, and 0.94, respectively. GPT-3.5 Turbo exhibited <italic>F</italic><sub>1</sub>-scores of 0.89, 0.92, 0.91, and 0.91, respectively.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Confusion matrices. Classification performance of GPT-3.5 Turbo, GPT-4o, and GPT-o1 when predicting whether a trial was positive or negative based on different sections of the abstract.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v12i1e78221_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Classification performance. Accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score for GPT-3.5 Turbo, GPT-4o, and GPT-o1 when predicting whether a trial was positive based on different sections of the abstract.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy (95% CI)</td><td align="left" valign="bottom">Precision (95% CI)</td><td align="left" valign="bottom">Recall (95% CI)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Conclusions only</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-3.5 Turbo</td><td align="char" char="." valign="top">0.87 (0.83&#x2010;0.91)</td><td align="char" char="." valign="top">0.86 (0.82&#x2010;0.90)</td><td align="char" char="." valign="top">0.93 (0.90&#x2010;0.96)</td><td align="char" char="." valign="top">0.89 (0.86&#x2010;0.93)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="char" char="." 
valign="top">0.87 (0.83&#x2010;0.91)</td><td align="char" char="." valign="top">0.91 (0.87&#x2010;0.94)</td><td align="char" char="." valign="top">0.87 (0.83&#x2010;0.91)</td><td align="char" char="." valign="top">0.89 (0.85&#x2010;0.93)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-o1</td><td align="char" char="." valign="top">0.92 (0.89&#x2010;0.95)</td><td align="char" char="." valign="top">0.92 (0.89&#x2010;0.95)</td><td align="char" char="." valign="top">0.95 (0.92&#x2010;0.97)</td><td align="char" char="." valign="top">0.93 (0.90&#x2010;0.96)</td></tr><tr><td align="left" valign="top" colspan="5">Methods+conclusions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-3.5 Turbo</td><td align="char" char="." valign="top">0.90 (0.86&#x2010;0.94)</td><td align="char" char="." valign="top">0.90 (0.86&#x2010;0.93)</td><td align="char" char="." valign="top">0.94 (0.91&#x2010;0.97)</td><td align="char" char="." valign="top">0.92 (0.88&#x2010;0.95)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="char" char="." valign="top">0.90 (0.86&#x2010;0.94)</td><td align="char" char="." valign="top">0.94 (0.91&#x2010;0.97)</td><td align="char" char="." valign="top">0.88 (0.84&#x2010;0.92)</td><td align="char" char="." valign="top">0.91 (0.88&#x2010;0.95)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-o1</td><td align="char" char="." valign="top">0.95 (0.93&#x2010;0.98)</td><td align="char" char="." valign="top">0.95 (0.93&#x2010;0.98)</td><td align="char" char="." valign="top">0.97 (0.94&#x2010;0.99)</td><td align="char" char="." 
valign="top">0.96 (0.93&#x2010;0.98)</td></tr><tr><td align="left" valign="top" colspan="5">Methods+results+conclusions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-3.5 Turbo</td><td align="char" char="." valign="top">0.90 (0.86&#x2010;0.93)</td><td align="char" char="." valign="top">0.91 (0.88&#x2010;0.95)</td><td align="char" char="." valign="top">0.91 (0.88&#x2010;0.95)</td><td align="char" char="." valign="top">0.91 (0.88&#x2010;0.95)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="char" char="." valign="top">0.94 (0.91&#x2010;0.97)</td><td align="char" char="." valign="top">0.99 (0.97&#x2010;1.00)</td><td align="char" char="." valign="top">0.90 (0.87&#x2010;0.94)</td><td align="char" char="." valign="top">0.94 (0.91&#x2010;0.97)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-o1</td><td align="char" char="." valign="top">0.97 (0.95&#x2010;0.99)</td><td align="char" char="." valign="top">0.98 (0.96&#x2010;1.00)</td><td align="char" char="." valign="top">0.97 (0.95&#x2010;0.99)</td><td align="char" char="." valign="top">0.98 (0.96&#x2010;0.99)</td></tr><tr><td align="left" valign="top" colspan="5">Title+abstract</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-3.5 Turbo</td><td align="char" char="." valign="top">0.90 (0.86&#x2010;0.93)</td><td align="char" char="." valign="top">0.90 (0.86&#x2010;0.94)</td><td align="char" char="." valign="top">0.92 (0.89&#x2010;0.96)</td><td align="char" char="." valign="top">0.91 (0.88&#x2010;0.95)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="char" char="." 
valign="top">0.93 (0.90&#x2010;0.96)</td><td align="char" char="." valign="top">0.96 (0.94&#x2010;0.99)</td><td align="char" char="." valign="top">0.91 (0.88&#x2010;0.95)</td><td align="char" char="." valign="top">0.94 (0.91&#x2010;0.97)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-o1</td><td align="char" char="." valign="top">0.97 (0.95&#x2010;0.99)</td><td align="char" char="." valign="top">0.98 (0.96&#x2010;1.00)</td><td align="char" char="." valign="top">0.97 (0.94&#x2010;0.99)</td><td align="char" char="." valign="top">0.97 (0.95&#x2010;0.99)</td></tr></tbody></table></table-wrap><p>We further analyzed trials that were incorrectly predicted as positive by GPT-o1 when the model was only provided with the conclusions but predicted correctly when provided with the title and abstract. Of these 10 trials, 6 (60%) did not mention the primary end point in the conclusions [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. One mentioned an improvement in the primary end point in a subgroup [<xref ref-type="bibr" rid="ref13">13</xref>]. One mentioned both improved secondary end points and the unimproved primary end point without specifying which was which [<xref ref-type="bibr" rid="ref14">14</xref>]. The remaining 2 trials mentioned that one arm was superior to the other one without specifying that it was the control arm that showed improved results [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
The list of trials is provided in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Trials that were incorrectly predicted as positive by GPT-o1 when the model was only provided with the conclusions but predicted correctly when provided with the title and abstract.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Title</td><td align="left" valign="bottom">Conclusions reported on primary end point</td><td align="left" valign="bottom">Possible reason for incorrect prediction</td></tr></thead><tbody><tr><td align="left" valign="top">&#x201C;Total Body Irradiation or Chemotherapy Conditioning in Childhood ALL: A Multinational, Randomized, Noninferiority Phase III Study&#x201D; [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Conclusions mentioned that TBI<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> plus etoposide showed improved overall survival. 
Therefore, the model likely thought that TBI plus etoposide was the intervention that was tested, whereas it was actually the control.</td></tr><tr><td align="left" valign="top">&#x201C;Volasertib Versus Chemotherapy in Platinum-Resistant or -Refractory Ovarian Cancer: A Randomized Phase II Groupe des Investigateurs Nationaux pour l&#x2019;Etude des Cancers de l&#x2019;Ovaire Study&#x201D; [<xref ref-type="bibr" rid="ref7">7</xref>]</td><td align="left" valign="top">No</td><td align="left" valign="top">Primary end point was not discussed in the conclusions.</td></tr><tr><td align="left" valign="top">&#x201C;High-Dose Therapy and Autologous Blood Stem-Cell Transplantation Compared With Conventional Treatment in Myeloma Patients Aged 55 to 65 Years: Long-Term Results of a Randomized Control Trial From the Group Myelome-Autogreffe&#x201D; [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Conclusions mentioned both improved secondary end points and the unimproved primary end point without specifying which was which.</td></tr><tr><td align="left" valign="top">&#x201C;Results of a Randomized Trial of Chlorambucil Versus Fludarabine for Patients With Untreated Waldenstr&#x00F6;m Macroglobulinemia, Marginal Zone Lymphoma, or Lymphoplasmacytic Lymphoma&#x201D; [<xref ref-type="bibr" rid="ref8">8</xref>]</td><td align="left" valign="top">No</td><td align="left" valign="top">Primary end point was not mentioned in the conclusions.</td></tr><tr><td align="left" valign="top">&#x201C;Bortezomib-Dexamethasone, Rituximab, and Cyclophosphamide as First-Line Treatment for Waldenstr&#x00F6;m&#x2019;s Macroglobulinemia: A Prospectively Randomized Trial of the European Consortium for Waldenstr&#x00F6;m&#x2019;s Macroglobulinemia&#x201D; [<xref ref-type="bibr" rid="ref9">9</xref>]</td><td align="left" valign="top">No</td><td align="left" valign="top">Primary end point was not mentioned in the 
conclusions.</td></tr><tr><td align="left" valign="top">&#x201C;Adjuvant tamoxifen and exemestane in early breast cancer (TEAM): a randomised phase 3 trial&#x201D; [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">No</td><td align="left" valign="top">Primary end point was not mentioned in the conclusions.</td></tr><tr><td align="left" valign="top">&#x201C;Addition of Bevacizumab to Bolus Fluorouracil and Leucovorin in First-Line Metastatic Colorectal Cancer: Results of a Randomized Phase II Trial&#x201D; [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">No</td><td align="left" valign="top">Primary end point was not mentioned in the conclusions.</td></tr><tr><td align="left" valign="top">&#x201C;Oral ibandronic acid versus intravenous zoledronic acid in treatment of bone metastases from breast cancer: a randomised, open label, non-inferiority phase 3 trial&#x201D; [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Conclusions mentioned the superiority of zoledronic acid. 
Therefore, the model likely thought that zoledronic acid was the intervention, whereas it was the comparator in this noninferiority study.</td></tr><tr><td align="left" valign="top">&#x201C;Bcl-2 Antisense (oblimersen sodium) Plus Dacarbazine in Patients With Advanced Melanoma: The Oblimersen Melanoma Study Group&#x201D; [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Improvement in the primary end point in a subgroup was mentioned.</td></tr><tr><td align="left" valign="top">&#x201C;Efficacy and Safety of Trabectedin or Dacarbazine for Metastatic Liposarcoma or Leiomyosarcoma After Failure of Conventional Chemotherapy: Results of a Phase III Randomized Multicenter Clinical Trial&#x201D; [<xref ref-type="bibr" rid="ref11">11</xref>]</td><td align="left" valign="top">No</td><td align="left" valign="top">Primary end point was not mentioned in the conclusions.</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>TBI: total body irradiation.</p></fn></table-wrap-foot></table-wrap><p>To confirm that those writing patterns were not equally frequent in trials correctly classified as negative, we also analyzed 10 random trials predicted correctly as negative by GPT-o1 and have provided the analysis in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Of these trials, only 10% (1/10) did not mention the primary end point for the whole trial population in its conclusions but, instead, reported the results of the primary end point in a positive subgroup [<xref ref-type="bibr" rid="ref17">17</xref>]. In total, 70% (7/10) of the trials explicitly mentioned that the primary end point failed to meet statistical significance or that the trial as a whole was negative or only mentioned the negative primary end point in their conclusions [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. 
A total of 20% (2/10) of the trials mentioned both the primary end point and secondary end points or subgroups [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we evaluated the ability of 3 commercial LLMs to classify oncology RCTs as positive or negative based on different sections of trial abstracts. Our findings demonstrate that modern LLMs, particularly more advanced models, can achieve high classification accuracy even when provided with limited information. Our findings also support the hypothesis that trials that are correctly classified as negative by an LLM when provided with the title and abstract but incorrectly classified as positive when provided with only the conclusions are likely to contain patterns that may be interpreted as spin. While there is no ground truth of what constitutes spin, not mentioning the results for the primary end point at all in the conclusions, mentioning an improvement in the primary end point that only occurred in a subgroup, or mixed reporting of primary and secondary end points without clear distinction would be considered at least questionable by many readers [<xref ref-type="bibr" rid="ref27">27</xref>]. Our findings also highlight that the LLM-based approach is not perfectly specific. In total, 20% (2/10) of the studies for which GPT-o1 was misled into believing they were positive when provided only with the conclusions had conclusions that clearly mentioned which arm had better outcomes. However, the LLM did not know which arm was the intervention and which arm was the control, so it assumed that the superior arm was the intervention arm. 
While this way of phrasing a conclusion may not be optimal for readability, it is certainly not an attempt at misleading the reader, who will still know which treatment yielded better results.</p><p>Therefore, our approach is likely not suitable as a fully automated solution. However, it demonstrated its potential to inform editors, reviewers, and authors alike of potential spin or unclear writing. The question of &#x201C;Are the results for the primary end point clearly recognizable in the conclusion?&#x201D; might serve as an alternative litmus test. Even though reviewers and journal editors are generally capable of recognizing questionable conclusions, we do believe that automated tools have value considering the ever-increasing list of items that have to be considered when conducting a careful review as they may, if implemented carefully, point toward parts of the manuscript that need increased attention. Another group of people who might benefit from a higher degree of automation are physicians who do not routinely read RCTs or have to do it in a situation in which they do not have time to fully digest all aspects of the research, such as in between patient consultations.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>While research on LLMs and spin is still in its infancy, Yun et al [<xref ref-type="bibr" rid="ref28">28</xref>] evaluated 22 LLMs and found that they are actually more susceptible to spin than humans. As LLMs are being used increasingly for screening and synthesizing scientific literature, this highlights the importance of improved detection of spin, preferably at the prepublication stage. However, the approach demonstrated in this study could also be leveraged as part of a screening pipeline to detect spin when trying to systematically analyze the literature in an automated fashion.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>This study has several strengths. 
The human annotation process was systematic, with independent dual review and consensus resolution, resulting in a reliable ground-truth dataset. Evaluating 3 LLMs of differing capability provided insights into how model complexity affects performance and sensitivity to unclear reporting. In addition, the structured comparison across 4 abstract conditions enabled us to isolate how specific sections of reporting contribute to misclassification.</p><p>This study has several limitations. First, the analysis was restricted to RCTs with 2 arms and a single primary end point. This constraint reduced complexity and helped ensure consistent interpretation but limits the applicability of our findings to trials with more complex designs, such as those involving multiple or co&#x2013;primary end points. As noted in this paper, such designs introduce additional analytic considerations, for example, prespecified alpha splitting, that would have increased methodological heterogeneity and potentially confounded the evaluation [<xref ref-type="bibr" rid="ref29">29</xref>]. Therefore, the restriction was deliberate, but it reduced generalizability.</p><p>Second, we did not include trials using analytical frameworks other than standard hypothesis testing, such as Bayesian designs [<xref ref-type="bibr" rid="ref30">30</xref>]. Because these studies report results differently and may emphasize posterior probabilities rather than traditional statistical significance, the performance of LLMs in such contexts remains unknown. This limitation reflects the scope of the feasibility study rather than an inherent barrier of the method.</p><p>Third, it is uncertain whether the models had previously encountered some of the included abstracts during training. If so, prior exposure could have artificially increased performance, particularly when models were presented with only part of an abstract. 
Although this possibility cannot be fully eliminated for proprietary language models, our key analyses focused on discrepancies between conclusion-only and full-abstract predictions. These discrepancies are less susceptible to prior knowledge because recognizing internal inconsistencies requires examining the relationship between sections rather than retrieving memorized text. Nonetheless, this limitation may have influenced overall performance metrics.</p><p>Fourth, this study used a single, clearly defined prompt that specified what should be considered a positive or negative trial. While this approach ensured consistent instructions across models and conditions, it remains possible that different prompting strategies would yield different results. The choice of a single explicit prompt was intended to minimize variability, but it may limit insight into how models behave under alternative or less directive task formulations.</p></sec><sec id="s4-4"><title>Future Directions</title><p>Future work could extend this approach to more complex trial designs, including studies with multiple or co&#x2013;primary end points, adaptive designs, or Bayesian frameworks, to determine whether LLM-based assessments remain reliable under conditions in which end point interpretation is less straightforward. Evaluating models from different vendors and open-source architectures may also help clarify how generalizable these findings are beyond the commercial systems examined in this study. In addition, refining prompting strategies or incorporating structured domain knowledge could improve model understanding of trial context, particularly in situations in which the distinction between intervention and control is not explicitly stated. Prospective integration of LLM-based screening tools into editorial workflows may help assess their practical utility in real-time manuscript evaluation. 
Finally, future studies may investigate whether LLMs can assist in promoting clearer reporting practices by providing automated feedback to authors during manuscript preparation.</p></sec><sec id="s4-5"><title>Conclusions</title><p>In conclusion, this study demonstrates that LLMs can highlight potential spin in oncology trial reporting by identifying inconsistencies between conclusions and full abstracts. These findings suggest a possible role for LLMs as supportive tools that draw attention to areas in which reporting may be unclear or incomplete. While not a substitute for expert review, such tools may help promote clearer communication of trial results. Further evaluation in more complex trial settings will be needed to determine how broadly this approach can be applied.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>No funding was received for this project.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available in the GitHub repository [<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: CK, PW</p><p>Data curation: CK, PW</p><p>Formal analysis: CK, PW</p><p>Methodology: CK, PW</p><p>Project administration: DMA, DRZ</p><p>Supervision: DRZ</p><p>Writing &#x2013; original draft: PW</p><p>Writing &#x2013; review &#x0026; editing: CK, FD, CS, DMA, RF, DRZ</p><p>All authors read and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb2">RCT</term><def><p>randomized controlled trial</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Booth</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Cescon</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tannock</surname><given-names>IF</given-names> </name><name name-style="western"><surname>Krzyzanowska</surname><given-names>MK</given-names> </name></person-group><article-title>Evolution of the randomized controlled trial in oncology over three decades</article-title><source>J Clin Oncol</source><year>2008</year><month>11</month><day>20</day><volume>26</volume><issue>33</issue><fpage>5458</fpage><lpage>5464</lpage><pub-id pub-id-type="doi">10.1200/JCO.2008.16.5456</pub-id><pub-id pub-id-type="medline">18955452</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahtani</surname><given-names>KR</given-names> </name></person-group><article-title>&#x201C;Spin&#x201D; in reports of clinical research</article-title><source>Evid Based Med</source><year>2016</year><month>12</month><volume>21</volume><issue>6</issue><fpage>201</fpage><lpage>202</lpage><pub-id pub-id-type="doi">10.1136/ebmed-2016-110570</pub-id><pub-id pub-id-type="medline">27737894</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boutron</surname><given-names>I</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Hopewell</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vera-Badillo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Tannock</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Ravaud</surname><given-names>P</given-names> </name></person-group><article-title>Impact of spin in the abstracts of articles reporting results of randomized controlled trials in the field of cancer: the SPIIN randomized controlled trial</article-title><source>J Clin Oncol</source><year>2014</year><month>12</month><day>20</day><volume>32</volume><issue>36</issue><fpage>4120</fpage><lpage>4126</lpage><pub-id pub-id-type="doi">10.1200/JCO.2014.56.7503</pub-id><pub-id pub-id-type="medline">25403215</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ito</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hashimoto</surname><given-names>A</given-names> </name><name name-style="western"><surname>Uemura</surname><given-names>K</given-names> </name><name name-style="western"><surname>Oba</surname><given-names>K</given-names> </name></person-group><article-title>Misleading reporting (spin) in noninferiority randomized clinical trials in oncology with statistically not significant results: a systematic review</article-title><source>JAMA Netw Open</source><year>2021</year><month>12</month><day>1</day><volume>4</volume><issue>12</issue><fpage>e2135765</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.35765</pub-id><pub-id pub-id-type="medline">34874407</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wayant</surname><given-names>C</given-names> </name><name name-style="western"><surname>Margalski</surname><given-names>D</given-names> </name><name name-style="western"><surname>Vaughn</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vassar</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of spin 
in oncology clinical trials</article-title><source>Crit Rev Oncol Hematol</source><year>2019</year><month>12</month><volume>144</volume><fpage>102821</fpage><pub-id pub-id-type="doi">10.1016/j.critrevonc.2019.102821</pub-id><pub-id pub-id-type="medline">31733444</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Windisch</surname><given-names>P</given-names> </name><name name-style="western"><surname>Dennst&#x00E4;dt</surname><given-names>F</given-names> </name><name name-style="western"><surname>Koechli</surname><given-names>C</given-names> </name><etal/></person-group><article-title>The impact of temperature on extracting information from clinical trial publications using large language models</article-title><source>Cureus</source><year>2024</year><month>12</month><day>15</day><volume>16</volume><issue>12</issue><fpage>e75748</fpage><pub-id pub-id-type="doi">10.7759/cureus.75748</pub-id><pub-id pub-id-type="medline">39811231</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pujade-Lauraine</surname><given-names>E</given-names> </name><name name-style="western"><surname>Selle</surname><given-names>F</given-names> </name><name name-style="western"><surname>Weber</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Volasertib versus chemotherapy in platinum-resistant or -refractory ovarian cancer: a randomized phase II Groupe des Investigateurs Nationaux pour l&#x2019;Etude des Cancers de l&#x2019;Ovaire Study</article-title><source>J Clin Oncol</source><year>2016</year><month>03</month><day>1</day><volume>34</volume><issue>7</issue><fpage>706</fpage><lpage>713</lpage><pub-id pub-id-type="doi">10.1200/JCO.2015.62.1474</pub-id><pub-id 
pub-id-type="medline">26755507</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Leblond</surname><given-names>V</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chevret</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Results of a randomized trial of chlorambucil versus fludarabine for patients with untreated Waldenstr&#x00F6;m macroglobulinemia, marginal zone lymphoma, or lymphoplasmacytic lymphoma</article-title><source>J Clin Oncol</source><year>2013</year><month>01</month><day>20</day><volume>31</volume><issue>3</issue><fpage>301</fpage><lpage>307</lpage><pub-id pub-id-type="doi">10.1200/JCO.2012.44.7920</pub-id><pub-id pub-id-type="medline">23233721</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buske</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dimopoulos</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Grunenberg</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Bortezomib-dexamethasone, rituximab, and cyclophosphamide as first-line treatment for Waldenstr&#x00F6;m&#x2019;s macroglobulinemia: a prospectively randomized trial of the European Consortium for Waldenstr&#x00F6;m&#x2019;s macroglobulinemia</article-title><source>J Clin Oncol</source><year>2023</year><month>05</month><day>10</day><volume>41</volume><issue>14</issue><fpage>2607</fpage><lpage>2616</lpage><pub-id pub-id-type="doi">10.1200/JCO.22.01805</pub-id><pub-id pub-id-type="medline">36763945</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Kabbinavar</surname><given-names>FF</given-names> </name><name name-style="western"><surname>Schulz</surname><given-names>J</given-names> </name><name name-style="western"><surname>McCleod</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Addition of bevacizumab to bolus fluorouracil and leucovorin in first-line metastatic colorectal cancer: results of a randomized phase II trial</article-title><source>J Clin Oncol</source><year>2005</year><month>06</month><day>1</day><volume>23</volume><issue>16</issue><fpage>3697</fpage><lpage>3705</lpage><pub-id pub-id-type="doi">10.1200/JCO.2005.05.112</pub-id><pub-id pub-id-type="medline">15738537</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Demetri</surname><given-names>GD</given-names> </name><name name-style="western"><surname>von Mehren</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>RL</given-names> </name><etal/></person-group><article-title>Efficacy and safety of trabectedin or dacarbazine for metastatic liposarcoma or leiomyosarcoma after failure of conventional chemotherapy: results of a phase III randomized multicenter clinical trial</article-title><source>J Clin Oncol</source><year>2016</year><month>03</month><day>10</day><volume>34</volume><issue>8</issue><fpage>786</fpage><lpage>793</lpage><pub-id pub-id-type="doi">10.1200/JCO.2015.62.4734</pub-id><pub-id pub-id-type="medline">26371143</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van de Velde</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Rea</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Seynaeve</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Adjuvant tamoxifen and exemestane in early breast cancer (TEAM): a randomised phase 3 trial</article-title><source>Lancet</source><year>2011</year><month>01</month><day>22</day><volume>377</volume><issue>9762</issue><fpage>321</fpage><lpage>331</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(10)62312-4</pub-id><pub-id pub-id-type="medline">21247627</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedikian</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Millward</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pehamberger</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Bcl-2 antisense (oblimersen sodium) plus dacarbazine in patients with advanced melanoma: the Oblimersen Melanoma Study Group</article-title><source>J Clin Oncol</source><year>2006</year><month>10</month><day>10</day><volume>24</volume><issue>29</issue><fpage>4738</fpage><lpage>4745</lpage><pub-id pub-id-type="doi">10.1200/JCO.2006.06.0483</pub-id><pub-id pub-id-type="medline">16966688</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fermand</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Katsahian</surname><given-names>S</given-names> </name><name name-style="western"><surname>Divine</surname><given-names>M</given-names> </name><etal/></person-group><article-title>High-dose therapy and autologous blood stem-cell transplantation compared with conventional treatment in myeloma patients aged 55 to 65 years: long-term results of a randomized control trial from the Group 
Myelome-Autogreffe</article-title><source>J Clin Oncol</source><year>2005</year><month>12</month><day>20</day><volume>23</volume><issue>36</issue><fpage>9227</fpage><lpage>9233</lpage><pub-id pub-id-type="doi">10.1200/JCO.2005.03.0551</pub-id><pub-id pub-id-type="medline">16275936</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peters</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dalle</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Locatelli</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Total body irradiation or chemotherapy conditioning in childhood ALL: a multinational, randomized, noninferiority phase III study</article-title><source>J Clin Oncol</source><year>2021</year><month>02</month><day>1</day><volume>39</volume><issue>4</issue><fpage>295</fpage><lpage>307</lpage><pub-id pub-id-type="doi">10.1200/JCO.20.02529</pub-id><pub-id pub-id-type="medline">33332189</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barrett-Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Casbard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abraham</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Oral ibandronic acid versus intravenous zoledronic acid in treatment of bone metastases from breast cancer: a randomised, open label, non-inferiority phase 3 trial</article-title><source>Lancet Oncol</source><year>2014</year><month>01</month><volume>15</volume><issue>1</issue><fpage>114</fpage><lpage>122</lpage><pub-id pub-id-type="doi">10.1016/S1470-2045(13)70539-4</pub-id><pub-id 
pub-id-type="medline">24332514</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hurwitz</surname><given-names>HI</given-names> </name><name name-style="western"><surname>Uppal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wagner</surname><given-names>SA</given-names> </name><etal/></person-group><article-title>Randomized, double-blind, phase II study of ruxolitinib or placebo in combination with capecitabine in patients with metastatic pancreatic cancer for whom therapy with gemcitabine has failed</article-title><source>J Clin Oncol</source><year>2015</year><month>12</month><day>1</day><volume>33</volume><issue>34</issue><fpage>4039</fpage><lpage>4047</lpage><pub-id pub-id-type="doi">10.1200/JCO.2015.61.4578</pub-id><pub-id pub-id-type="medline">26351344</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gregorc</surname><given-names>V</given-names> </name><name name-style="western"><surname>Gaafar</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Favaretto</surname><given-names>A</given-names> </name><etal/></person-group><article-title>NGR-hTNF in combination with best investigator choice in previously treated malignant pleural mesothelioma (NGR015): a randomised, double-blind, placebo-controlled phase 3 trial</article-title><source>Lancet Oncol</source><year>2018</year><month>06</month><volume>19</volume><issue>6</issue><fpage>799</fpage><lpage>811</lpage><pub-id pub-id-type="doi">10.1016/S1470-2045(18)30193-1</pub-id><pub-id pub-id-type="medline">29753703</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Small</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Schellhammer</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Higano</surname><given-names>CS</given-names> </name><etal/></person-group><article-title>Placebo-controlled phase III trial of immunologic therapy with sipuleucel-T (APC8015) in patients with metastatic, asymptomatic hormone refractory prostate cancer</article-title><source>J Clin Oncol</source><year>2006</year><month>07</month><day>1</day><volume>24</volume><issue>19</issue><fpage>3089</fpage><lpage>3094</lpage><pub-id pub-id-type="doi">10.1200/JCO.2005.04.5252</pub-id><pub-id pub-id-type="medline">16809734</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cobleigh</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Anderson</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Siziopikou</surname><given-names>KP</given-names> </name><etal/></person-group><article-title>Comparison of radiation with or without concurrent trastuzumab for HER2-positive ductal carcinoma in situ resected by lumpectomy: a phase III clinical trial</article-title><source>J Clin Oncol</source><year>2021</year><month>07</month><day>20</day><volume>39</volume><issue>21</issue><fpage>2367</fpage><lpage>2374</lpage><pub-id pub-id-type="doi">10.1200/JCO.20.02824</pub-id><pub-id pub-id-type="medline">33739848</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yap</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Kwok</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Syn</surname><given-names>N</given-names> 
</name><etal/></person-group><article-title>Predictors of hand-foot syndrome and pyridoxine for prevention of capecitabine-induced hand-foot syndrome: a randomized clinical trial</article-title><source>JAMA Oncol</source><year>2017</year><month>11</month><day>1</day><volume>3</volume><issue>11</issue><fpage>1538</fpage><lpage>1545</lpage><pub-id pub-id-type="doi">10.1001/jamaoncol.2017.1269</pub-id><pub-id pub-id-type="medline">28715540</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pisansky</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Pugh</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Greenberg</surname><given-names>RE</given-names> </name><etal/></person-group><article-title>Tadalafil for prevention of erectile dysfunction after radiotherapy for prostate cancer: the Radiation Therapy Oncology Group [0831] randomized clinical trial</article-title><source>JAMA</source><year>2014</year><month>04</month><day>2</day><volume>311</volume><issue>13</issue><fpage>1300</fpage><lpage>1307</lpage><pub-id pub-id-type="doi">10.1001/jama.2014.2626</pub-id><pub-id pub-id-type="medline">24691606</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>GX</given-names> </name><etal/></person-group><article-title>Randomized phase III trial of induction chemotherapy with docetaxel, cisplatin, and fluorouracil followed by surgery versus up-front surgery in locally advanced resectable oral squamous cell carcinoma</article-title><source>J Clin 
Oncol</source><year>2013</year><month>02</month><day>20</day><volume>31</volume><issue>6</issue><fpage>744</fpage><lpage>751</lpage><pub-id pub-id-type="doi">10.1200/JCO.2012.43.8820</pub-id><pub-id pub-id-type="medline">23129742</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ray-Coquard</surname><given-names>I</given-names> </name><name name-style="western"><surname>Harter</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lorusso</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Effect of weekly paclitaxel with or without bevacizumab on progression-free rate among patients with relapsed ovarian sex cord-stromal tumors: the ALIENOR/ENGOT-ov7 randomized clinical trial</article-title><source>JAMA Oncol</source><year>2020</year><month>12</month><day>1</day><volume>6</volume><issue>12</issue><fpage>1923</fpage><lpage>1930</lpage><pub-id pub-id-type="doi">10.1001/jamaoncol.2020.4574</pub-id><pub-id pub-id-type="medline">33030515</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Butts</surname><given-names>C</given-names> </name><name name-style="western"><surname>Socinski</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>PL</given-names> </name><etal/></person-group><article-title>Tecemotide (L-BLP25) versus placebo after chemoradiotherapy for stage III non-small-cell lung cancer (START): a randomised, double-blind, phase 3 trial</article-title><source>Lancet Oncol</source><year>2014</year><month>01</month><volume>15</volume><issue>1</issue><fpage>59</fpage><lpage>68</lpage><pub-id pub-id-type="doi">10.1016/S1470-2045(13)70510-2</pub-id><pub-id 
pub-id-type="medline">24331154</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Chap</surname><given-names>LI</given-names> </name><name name-style="western"><surname>Holmes</surname><given-names>FA</given-names> </name><etal/></person-group><article-title>Randomized phase III trial of capecitabine compared with bevacizumab plus capecitabine in patients with previously treated metastatic breast cancer</article-title><source>J Clin Oncol</source><year>2005</year><month>02</month><day>1</day><volume>23</volume><issue>4</issue><fpage>792</fpage><lpage>799</lpage><pub-id pub-id-type="doi">10.1200/JCO.2005.05.098</pub-id><pub-id pub-id-type="medline">15681523</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reynolds-Vaughn</surname><given-names>V</given-names> </name><name name-style="western"><surname>Riddle</surname><given-names>J</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schiesel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wayant</surname><given-names>C</given-names> </name><name name-style="western"><surname>Vassar</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of spin in the abstracts of emergency medicine randomized controlled trials</article-title><source>Ann Emerg Med</source><year>2020</year><month>03</month><volume>75</volume><issue>3</issue><fpage>423</fpage><lpage>431</lpage><pub-id pub-id-type="doi">10.1016/j.annemergmed.2019.03.011</pub-id><pub-id pub-id-type="medline">31101371</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yun</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Kouzy</surname><given-names>R</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Li</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name></person-group><article-title>Caught in the web of words: do LLMs fall for spin in medical literature?</article-title><source>Proc Mach Learn Res</source><year>2025</year><month>06</month><volume>287</volume><fpage>458</fpage><lpage>479</lpage><pub-id pub-id-type="medline">41257216</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>Multiple endpoints in clinical trials: guidance for industry</article-title><source>US Food and Drug Administration</source><year>2022</year><access-date>2026-01-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/files/drugs/published/Multiple-Endpoints-in-Clinical-Trials-Guidance-for-Industry.pdf">https://www.fda.gov/files/drugs/published/Multiple-Endpoints-in-Clinical-Trials-Guidance-for-Industry.pdf</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van de Schoot</surname><given-names>R</given-names> </name><name name-style="western"><surname>Depaoli</surname><given-names>S</given-names> </name><name name-style="western"><surname>King</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Bayesian statistics and modelling</article-title><source>Nat Rev Methods 
Primers</source><year>2021</year><month>01</month><day>14</day><volume>1</volume><fpage>1</fpage><pub-id pub-id-type="doi">10.1038/s43586-020-00001-2</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>windisch-paul/positive_negative</article-title><source>GitHub</source><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/windisch-paul/positive_negative">https://github.com/windisch-paul/positive_negative</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Analysis of 10 random trials predicted correctly as negative by GPT-o1.</p><media xlink:href="cancer_v12i1e78221_app1.pdf" xlink:title="PDF File, 64 KB"/></supplementary-material></app-group></back></article>