<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Cancer</journal-id><journal-id journal-id-type="publisher-id">cancer</journal-id><journal-id journal-id-type="index">21</journal-id><journal-title>JMIR Cancer</journal-title><abbrev-journal-title>JMIR Cancer</abbrev-journal-title><issn pub-type="epub">2369-1999</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e71102</article-id><article-id pub-id-type="doi">10.2196/71102</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Understanding Cancer Survivorship Care Needs Using Amazon Reviews: Content Analysis, Algorithm Development, and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Liwei</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lu</surname><given-names>Qiuhao</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Rui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Harrison</surname><given-names>Taylor B</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jia</surname><given-names>Heling</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Huang</surname><given-names>Ming</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dowst</surname><given-names>Heidi</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Rui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Badr</surname><given-names>Hoda</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fan</surname><given-names>Jungwei W</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Liu</surname><given-names>Hongfang</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Clinical and Health Informatics, McWilliams School of Biomedical Informatics, The University of Texas Health Science Center at Houston</institution><addr-line>7000 Fannin Street, Suite 600</addr-line><addr-line>Houston</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Health Data Science and Artificial Intelligence, McWilliams School of Biomedical Informatics, The University of Texas Health Science 
Center at Houston</institution><addr-line>Houston</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Artificial Intelligence and Informatics, Mayo Clinic</institution><addr-line>Rochester</addr-line><addr-line>MN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Bioinformatics and Computational Biology, University of Minnesota</institution><addr-line>Twin Cities</addr-line><addr-line>MN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Dan L Duncan Comprehensive Cancer Center, Baylor College of Medicine</institution><addr-line>Houston</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Surgery, University of Minnesota</institution><addr-line>Twin Cities</addr-line><addr-line>MN</addr-line><country>United States</country></aff><aff id="aff7"><institution>Department of Medical Oncology, Thomas Jefferson University</institution><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Cahill</surname><given-names>Naomi</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Elbattah</surname><given-names>Mahmoud</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Michailidis</surname><given-names>Panagiotis D</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Xunyu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hongfang Liu, PhD, Department of Clinical and Health Informatics, McWilliams School of 
Biomedical Informatics, The University of Texas Health Science Center at Houston, 7000 Fannin Street, Suite 600, Houston, TX, 77030, United States, 1 713-500-3900; <email>hongfang.liu@uth.tmc.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>23</day><month>9</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e71102</elocation-id><history><date date-type="received"><day>11</day><month>01</month><year>2025</year></date><date date-type="rev-recd"><day>14</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>19</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Liwei Wang, Qiuhao Lu, Rui Li, Taylor B Harrison, Heling Jia, Ming Huang, Heidi Dowst, Rui Zhang, Hoda Badr, Jungwei W Fan, Hongfang Liu. Originally published in JMIR Cancer (<ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org">https://cancer.jmir.org</ext-link>), 23.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cancer, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org/">https://cancer.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://cancer.jmir.org/2025/1/e71102"/><abstract><sec><title>Background</title><p>Complementary therapies are being increasingly used by cancer survivors. As a channel for customers to share their feelings, outcomes, and perceived knowledge about the products purchased from e-commerce platforms, Amazon consumer reviews are a valuable real-world data source for understanding cancer survivorship care needs.</p></sec><sec><title>Objective</title><p>In this study, we aimed to highlight the potential of using Amazon consumer reviews as a novel source for identifying cancer survivorship care needs, particularly related to symptom self-management. Specifically, we present a publicly available, manually annotated corpus derived from Amazon reviews of health-related products and develop baseline natural language processing models using deep learning and large language model (LLM) to demonstrate the usability of this dataset.</p></sec><sec sec-type="methods"><title>Methods</title><p>We preprocessed the Amazon review dataset to identify sentences with cancer mentions through a rule-based method and conducted content analysis including text feature analysis, sentiment analysis, topic modeling, cancer type, and symptom association analysis. We then designed an annotation guideline, targeting survivorship-relevant constructs. 
A total of 159 reviews were annotated, and baseline models were developed based on deep learning and large language model (LLM) for named entity recognition and text classification tasks.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 4703 sentences containing positive cancer mentions were identified, drawn from 3349 reviews associated with 2589 distinct products. The identified topics through topic modeling revealed meaningful insights into cancer symptom management and survivorship experiences. Examples included discussions of green tea use during chemotherapy, cancer prevention strategies, and product recommendations for breast cancer. Top 15 symptoms in reviews were also identified, with pain being the most frequent symptom, followed by inflammation, fatigue, etc. The annotation labels were designed to capture cancer types, indicated symptoms, and symptom management outcomes. The resulting annotation corpus contains 2067 labels from 159 Amazon reviews. It is publicly accessible, together with the annotation guideline through the Open Health Natural Language Processing (OHNLP) GitHub. Our baseline model, Bert-base-cased, achieved the highest weighted average <italic>F</italic><sub>1</sub>-score, that is, 66.92%, for named entity recognition, and LLM gpt4-1106-preview-chat achieved the highest <italic>F</italic><sub>1</sub>-score for text classification tasks, that is, 66.67% for &#x201C;Harmful outcome,&#x201D; 88.46% for &#x201C;Favorable outcome&#x201D; and 73.33% for &#x201C;Ambiguous outcome.&#x201D;</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our results demonstrate the potential of Amazon consumer reviews as a novel data source for identifying persistent symptoms, concerns, and self-management strategies among cancer survivors. 
This corpus, along with the baseline natural language processing models developed for named entity recognition and text classification, lays the groundwork for future methodological advancements in cancer survivorship research. Importantly, insights from this study could be evaluated against established clinical guidelines for symptom management in cancer survivorship care. By revealing the feasibility of using consumer-generated data for mining survivorship-related experiences, this study offers a promising foundation for future research and argumentation analysis aimed at improving long-term outcomes and support for cancer survivors.</p></sec></abstract><kwd-group><kwd>real-world data</kwd><kwd>cancer research</kwd><kwd>cancer survivorship care</kwd><kwd>natural language processing</kwd><kwd>annotation</kwd><kwd>baseline models</kwd><kwd>deep learning</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The treatment of cancer results in unintended side effects and outcomes including pain, fatigue, weakness, anorexia, constipation, anxiety, dyspnea, nausea, and vomiting. These symptoms may emerge during active treatment and frequently persist into the posttreatment phase, necessitating continued monitoring and support. The National Cancer Institute defines cancer survivorship care as beginning at cancer diagnosis and continuing through the remainder of a patient&#x2019;s life, encompassing both medical and supportive care needs across the continuum of care [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Recognizing its importance, the Institute of Medicine, the National Cancer Institute, and the American Society of Clinical Oncologists have increasingly prioritized survivorship care as a critical component of efforts to improve long-term cancer outcomes and quality of life [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Cancer survivorship care extends well beyond surveillance for recurrence or secondary malignancies. According to the National Cancer Institute&#x2019;s National Standards for Cancer Survivorship Care, high-quality survivorship care should address several key focus areas: communication and coordination of care, prevention and surveillance of new or recurrent cancers, symptom management and supportive care, and provision of practical resources to help survivors navigate life after treatment [<xref ref-type="bibr" rid="ref2">2</xref>]. These standards underscore the need for a comprehensive, patient-centered approach that supports cancer survivors during and after the transition out of active oncology care. Addressing ongoing needs&#x2014;such as managing long-term and late effects of treatment, promoting healthy behaviors, and ensuring access to psychosocial support&#x2014;is critical to optimizing survivors&#x2019; quality of life, particularly during periods of reduced clinical oversight.</p><p>Digital platforms such as social media and forums have emerged as important spaces where cancer survivors seek support, share experiences, and access health information [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. These platforms also offer a rich, untapped source of real-world data for researchers seeking to understand the survivorship experience from the patient&#x2019;s perspective. 
Advances in large language models such as OpenAI&#x2019;s GPT models, Gemini, and LLAMA have also made it increasingly feasible to process and extract insights from these large, unstructured text datasets.</p><p>One such promising resource is Amazon&#x2019;s consumer product review system. As a widely used e-commerce platform with national reach&#x2014;including rural and underserved areas&#x2014;Amazon provides consumers with the opportunity to share detailed reflections on their experiences with health and wellness products. These reviews often contain personal narratives about symptom self-management, perceived product effectiveness, and emotional responses, which may be particularly relevant for understanding the needs of cancer survivors. Theoretically, language content (eg, the percentage of words related to a topic)&#x2014;such as the proportion of words associated with specific symptoms or outcomes&#x2014;can reflect an individual&#x2019;s focus and meaning-making processes [<xref ref-type="bibr" rid="ref7">7</xref>]. Analyzing large-scale consumer-review data can thus offer insights into consumer knowledge, attitudes, and behaviors from a population health perspective [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Furthermore, the structured format of Amazon reviews, coupled with the inclusion of product use experiences and verified purchase indicators, enhances their value as a real-world data source for health care studies.</p><p>To process the large volume of unstructured text generated by consumers, text mining and natural language processing (NLP) techniques are essential for extracting meaningful patterns and insights. For example, previous applications of NLP in health forums have successfully extracted clinically relevant information such as treatment types, medication names, and side effects from cancer-related user-generated content [<xref ref-type="bibr" rid="ref10">10</xref>]. 
Health NLP&#x2014;an interdisciplinary field that integrates computational linguistics with health care&#x2014;has received growing attention in recent years [<xref ref-type="bibr" rid="ref11">11</xref>], leading to the development of a range of NLP tools and systems. One such platform, Open Health NLP (OHNLP), provides open-source clinical NLP software that facilitates large-scale analysis of free-text health data [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Despite the growing application of NLP in health care, existing studies using Amazon reviews as data sources for health-related insights have largely focused on noncancer domains such as erectile dysfunction and testosterone supplements [<xref ref-type="bibr" rid="ref15">15</xref>], eye health [<xref ref-type="bibr" rid="ref16">16</xref>], and chronic pain [<xref ref-type="bibr" rid="ref17">17</xref>]. These studies demonstrate the feasibility of extracting product-related health experiences from consumer reviews. They also underscore a critical gap: there has been limited exploration of how individuals affected by cancer&#x2014;particularly survivors&#x2014;use Amazon to share experiences with complementary therapies and long-term symptoms and to navigate posttreatment concerns. Understanding these patterns is crucial for addressing the informational and self-management needs of cancer survivors.</p><p>Amazon product reviews represent a novel and largely untapped data resource for exploring cancer symptom management from the survivor&#x2019;s perspective. These reviews may reveal implicit information about how survivors respond to persistent symptoms, evaluate over-the-counter and complementary therapies, and seek support outside of traditional health care settings. Annotated corpora are critical for training and evaluating NLP algorithms that can reliably identify these patterns. 
However, existing Amazon review datasets have been developed primarily for sentiment analysis [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>], and there is a lack of manually annotated cancer-specific corpora that focus on survivorship-related constructs.</p><p>In this study, we aim to address this gap by evaluating the potential of Amazon consumer reviews to surface cancer survivors&#x2019; ongoing concerns, persistent symptoms, and unmet needs. Specifically, we (1) present a publicly available, manually annotated corpus derived from Amazon reviews of health-related products, and (2) develop baseline NLP models using deep learning and large language model (LLM) approaches to demonstrate the usability of this dataset. These tools provide the foundation for future research that leverages consumer-generated data to inform survivorship interventions and improve long-term outcomes for cancer survivors.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Source</title><p>We used the preprocessed dataset of the Health &#x0026; Personal Care category containing reviews and metadata from Amazon between May 1996 and July 2014 [<xref ref-type="bibr" rid="ref21">21</xref>]. This dataset has been deduplicated, consisting of 2,982,326 reviews and 263,032 metadata records. Review data includes reviewer ID, the Amazon Standard Identification Number (ASIN) which Amazon uses to identify products, reviewer name, helpfulness of rating, review text, overall rating (1&#x2010;5 stars), summary of review, and review time. Metadata of the reviews includes ASIN, title, price, image URL, what items the customer also bought, what items the customer also viewed, what items the customer bought together, sales rank, brand, and categories. ASIN is the primary key to link review text and metadata.</p></sec><sec id="s2-2"><title>Study Design</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows the study design. 
Multiple methodologies have been developed to identify named entities in texts, that is, machine learning, deep learning, hybrid, and rule-based methods [<xref ref-type="bibr" rid="ref22">22</xref>]. In the first step, we used a rule-based method to identify a set of review texts with cancer mentions for a high-level content analysis. We then created an annotated corpus from the set of review texts and developed baseline NLP models, including deep learning and LLM, for named entity recognition (NER) and text classification.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study design. DL: deep learning; LLM: large language models; NER: named entity recognition.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e71102_fig01.png"/></fig></sec><sec id="s2-3"><title>Data Preprocessing With Rule-Based NLP Method</title><p>To identify the reviews with cancer mentions, we prepared a cancer dictionary based on the cancer branch of the Disease Ontology. It includes cell type cancer and organ system cancer integrated from different terminologies and vocabularies including the Catalog of Somatic Mutations in Cancer, The Cancer Genome Atlas, International Cancer Genome Consortium, Therapeutically Applicable Research to Generate Effective Treatments, Integrative Oncogenomics, and the Early Detection Research Network [<xref ref-type="bibr" rid="ref23">23</xref>]. In total, there are 4343 cancer term variants corresponding to 1535 cancer concepts. The cancer terms were prepared into the symbolic lexicon format compatible with the Open Health Natural Language Processing (OHNLP) Toolkit&#x2019;s NLP engine MedTagger [<xref ref-type="bibr" rid="ref24">24</xref>]. 
The open-source clinical NLP pipeline analyzed review texts and identified cancer-related medical concepts along with the assertion status of the cancer concept including certainty (ie, positive, negative, hypothetical, and possible). We kept only positive cancer concept mentions for further analysis.</p></sec><sec id="s2-4"><title>Content Analysis</title><p>We summarized the features of texts containing sentences with cancer mentions, conducted sentiment analysis, topic modeling, and visualization of cancer types and symptoms association for the review sentences with cancer mentions to gain insights into the prevailing themes and mood surrounding discussions related to cancer within the dataset.</p><sec id="s2-4-1"><title>Text Feature Analysis</title><p>To understand the text features of review data, we performed text complexity analysis to summarize review texts containing the sentences with cancer mentions, including number of review texts, number of sentences, and number of words. For comparison purposes, the above metrics were also calculated for the entire collection of reviews from the Health &#x0026; Personal Care category.</p></sec><sec id="s2-4-2"><title>Sentiment Analysis</title><p>Bert-base-multilingual-uncased-sentiment [<xref ref-type="bibr" rid="ref25">25</xref>] is a fine-tuned model from a bert-base-multilingual-uncased model for sentiment analysis on product reviews in 6 languages including English. Based on 5000 held-out product reviews for English, the accuracy (exact), that is, exact match for the number of stars is 67%. Accuracy (off-by-1), that is, the percentage of reviews where the number of stars the model predicts differs by a maximum of 1 from the number given by the human reviewer is 95%. The fine-tuned model was used for sentiment analysis of review sentences with cancer term mentions. This model predicts the sentiment of input text as a number of stars (between 1 and 5). The higher the sentiment score, the more positive the review. 
The lack of context has been one major challenge in sentiment analysis that can affect the interpretation of sentiment [<xref ref-type="bibr" rid="ref26">26</xref>]. We consider that identifying customer attitudes based on the sentence containing cancer mentions instead of the whole review text can be more constructive in understanding consumers&#x2019; efficacy and safety perceptions. The sentiment of the review sentences with cancer mentions detected by MedTagger was further analyzed to identify positive or negative attitudes toward the product [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. We analyzed the distribution of sentiment scores across the review sentences with cancer mentions, and the trend of average sentiment score between 1996 and 2014.</p></sec><sec id="s2-4-3"><title>Topic Modeling</title><p>We used a sentence embedding model (ie, bge-small-en) [<xref ref-type="bibr" rid="ref29">29</xref>] to transform the textual content of reviews into numerical embeddings. These embeddings capture the semantic essence of each document in a high-dimensional space. We then applied UMAP (Uniform Manifold Approximation and Projection) [<xref ref-type="bibr" rid="ref30">30</xref>] to the embeddings for dimensionality reduction. This step is crucial for visualization, as it converts high-dimensional data into a 2-dimensional format suitable for plotting. The core of the analysis is performed by BERTopic [<xref ref-type="bibr" rid="ref31">31</xref>], a model that identifies distinct topics within the text data. BERTopic relies on sub-models for embeddings (provided by SentenceTransformer of bge-small-en), dimensionality reduction (UMAP), and hierarchical clustering (HDBSCAN) [<xref ref-type="bibr" rid="ref32">32</xref>]. In addition, a quantized LLM (ie, openhermes-2.5-mistral-7b) [<xref ref-type="bibr" rid="ref33">33</xref>] is incorporated for topic label generation. 
After fitting the data to the BERTopic model, topics are extracted along with their probabilities. Each topic is then assigned a label generated by the LLM based on a predefined prompt. These labels are designed to be concise, with a maximum of 5 words, and describe the essence of the documents within each topic.</p><p>The chosen sentences were preprocessed by removing stop-words, special characters, and numbers and removing sentences with pets (dog, cat, etc). We detected topics based on all sentences with cancer mentions, as well as the sentences from 5 sentiment score groups. We then visualized the results.</p></sec><sec id="s2-4-4"><title>Cancer Type and Symptom Association</title><p>To explore the relationship between reported cancer types and symptoms, we constructed bipartite graphs based on co-occurrence patterns in the chosen sentences. Symptom mentions were identified using a state-of-the-art LLM for NER, specifically, the UniversalNER-7b-all model, which was applied via a 0-shot strategy. We then calculated the frequency of cancer type-symptom co-occurrences to generate a set of unique pairs. Each bipartite graph consisted of 2 node sets&#x2014;cancer and symptoms&#x2014;with edges indicating their association frequency. Node placement was optimized to ensure even distribution and visual clarity within each group. Edge widths were normalized and scaled to reflect the relative frequency of each cancer type-symptom pair, allowing for a visual representation of the strength of association between nodes.</p></sec></sec><sec id="s2-5"><title>Development of Gold Standards and Baseline Models</title><sec id="s2-5-1"><title>Gold Standard Creation</title><p>We developed an annotation guideline (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) to support the systematic labeling of target data elements and their associated class and type designations from Amazon customer reviews. 
The guideline was designed to be concise in order to minimize annotators&#x2019; cognitive load while ensuring consistency and enabling the annotated dataset&#x2019;s future use for information extraction tasks. The schema of annotated labels is presented in Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. Target concepts included cancer type, indicated symptoms, favorable outcome, harmful outcome, and product, with each concept having class or type options.</p><p>The cancer type concept has the class of either human or pet. The indicated symptoms, favorable outcome, and harmful outcome concepts were further categorized as either cancer-related or other, while the product concept was labeled as itself or other. In addition to class and type assignments, we also annotated each cancer type, indicated symptom, favorable outcome, and harmful outcome instance with one of 4 levels of certainty: positive, negative, hypothetical, or possible. For example, in the sentence: &#x201C;I&#x2019;ve had salivary gland cancer,&#x201D; the phrase &#x201C;salivary gland cancer&#x201D; was annotated as the cancer type concept, with the human class and positive certainty. In contrast, the phrase &#x201C;might prevent cancer&#x201D; in the sentence: &#x201C;Some people say it might prevent cancer,&#x201D; was annotated as a favorable outcome concept, with the cancer-related class and hypothetical certainty.</p><p>MedTator [<xref ref-type="bibr" rid="ref13">13</xref>], a free and open-source annotation tool, was used to perform the annotation task. Two annotators with backgrounds in medicine and informatics were first trained to annotate following the annotation guideline. After initial training, inter-annotator agreement (IAA) was assessed during the process. Once the annotators achieved a high level of agreement (<italic>F</italic><sub>1</sub>-score &#x2265;0.9), they proceeded to independently annotate the review texts. 
Discrepancies were resolved through an adjudication process involving discussion and consensus, resulting in a finalized gold standard corpus.</p><p>A total of 200 review texts containing cancer-related mentions, identified from the first step, were randomly selected for annotation. During the annotation process, we focused on reviews reflecting customer perspectives and excluded those summarizing books or other nonproduct-related content. This yielded a final sample of 159 consumer-generated reviews that were chosen for annotation.</p></sec><sec id="s2-5-2"><title>Development of Baseline Models</title><p>The annotated dataset was used to develop baseline models for 2 NLP tasks: NER and text classification. The goal of the NER task was to identify and classify entities mentioned in consumer reviews, specifically focusing on cancer types, indicated symptoms, and product mentions. For model development, we restricted annotations to human cancer types and cancer-related symptoms. The cancer type entity category included specific diagnoses such as &#x201C;breast cancer,&#x201D; &#x201C;leukemia,&#x201D; &#x201C;lymphoma,&#x201D; and &#x201C;melanoma,&#x201D; and only entities annotated with a positive certainty value were included. The indicated symptoms category captured phrases that suggested the condition or symptom the product was used to address (eg, &#x201C;affected her eye&#x201D; in the sentence &#x201C;She had cancer that affected her eye&#x201D;). The product entity included both direct product mentions and anaphoric references (eg, &#x201C;this&#x201D;).</p><p>For the text classification task, review excerpts were categorized into one of 3 outcome classes based on product impact on cancer-related conditions: favorable, harmful, or ambiguous. Favorable outcomes are comments where the product is noted to positively affect a cancer-related condition. Harmful outcomes are comments indicating a negative impact on cancer-related conditions. 
Ambiguous outcomes include comments with possible and hypothetical certainties, reflecting the speculative nature of the feedback.</p><p>We developed 2 types of baseline models for the NER and text classification tasks. The first used supervised fine-tuning (SFT) of BERT-like models, with 2 different classification heads on top, that is, token classification and sequence classification, respectively. We evaluated the performance of 2 widely used BERT-like models: bert-base-cased and Bio_ClinicalBERT. The second type of baseline was based on an LLM approach using the gpt4-1106-preview-chat model. We prompted the model to perform both tasks under varying in-context learning conditions: zero-shot, few-shot (using 5 examples), and many-shot (using all available training examples) [<xref ref-type="bibr" rid="ref34">34</xref>]. The prompts used for NER and text classification are included in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. For both NER and text classification tasks, we partitioned the annotated dataset using an 80-20 train-test split, with 80% of the data used for training and 20% reserved for evaluation.</p></sec></sec><sec id="s2-6"><title>Ethical Considerations</title><p>The data used in the study were publicly available [<xref ref-type="bibr" rid="ref21">21</xref>]. Because Amazon customer reviews typically do not contain personally identifiable information and we used a public dataset, personally identifiable information was not a concern in this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Content Analysis</title><p>A total of 4703 sentences containing positive cancer mentions were identified, drawn from 3349 reviews associated with 2589 distinct products. These cancer-related reviews contained a total of 26,078 sentences and 500,087 words, with an average length of 149.3 words per review. 
For comparison, the broader Health &#x0026; Personal Care category comprised 2,982,326 reviews, totaling 10,469,336 sentences and 199,501,964 words, with an average of 66.9 words per review. <xref ref-type="table" rid="table1">Table 1</xref> shows the distribution of product categories with reviews that include cancer-related mentions.</p><p>Table S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> shows the distribution of sentiment scores across the review sentences with cancer mentions, where scores 1 and 5 prevailed as the top sentiments. Temporally, there were increasing trends for the average sentiment score of the sentences with cancer mentions from 2004 to 2014, both before and after a dip around 2008.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of product categories in review sentences of cancer mentions.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Product categories</td><td align="left" valign="bottom">Number of review sentences with cancer mentions</td></tr></thead><tbody><tr><td align="left" valign="top">Health &#x0026; personal care</td><td align="left" valign="top">4703</td></tr><tr><td align="left" valign="top">&#x2003;Vitamins &#x0026; dietary supplements</td><td align="left" valign="top">2758</td></tr><tr><td align="left" valign="top">&#x2003;Health care</td><td align="left" valign="top">675</td></tr><tr><td align="left" valign="top">&#x2003;Personal care</td><td align="left" valign="top">361</td></tr><tr><td align="left" valign="top">&#x2003;Sports nutrition</td><td align="left" valign="top">283</td></tr><tr><td align="left" valign="top">&#x2003;Medical supplies &#x0026; equipment</td><td align="left" valign="top">268</td></tr><tr><td align="left" valign="top">&#x2003;Household supplies</td><td align="left" valign="top">99</td></tr><tr><td align="left" valign="top">&#x2003;Sexual wellness</td><td align="left" 
valign="top">45</td></tr></tbody></table></table-wrap><p><xref ref-type="fig" rid="figure2">Figure 2</xref> shows the results of topic modeling applied to all sentences containing cancer mentions extracted using the dictionary method. The identified topics revealed meaningful insights into cancer symptom management and survivorship experiences. Examples included discussions of green tea use during chemotherapy, cancer prevention strategies, product recommendations for breast cancer, post-treatment oral health issues, and antioxidant effects on tumor vasculature. To further examine the thematic structure, we conducted hierarchical clustering of these topics, as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>. Cluster labels were generated using GPT-4o (prompting instructions are shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>), resulting in the following high-level themes: General Cancer Concerns &#x0026; Alternative Health; Environmental &#x0026; Chemical Cancer Risks; Cancer Research &#x0026; Alternative Treatments; Scientific Studies &#x0026; Genetic Factors; Cancer Survivorship &#x0026; Treatment Journeys; Cancer Prevention &#x0026; Supplementation; and Cancer Support, Symptoms &#x0026; General Health. For example, the General Cancer Concerns &#x0026; Alternative Health cluster includes discussions related to cancer-related fears and disease progression, while the Cancer Support, Symptoms &#x0026; General Health cluster captures narratives related to pain management, lymph node involvement, and lymphedema. To quantify the distribution of content across these clusters, we further prompted GPT-4o to assign each sentence derived from topic modeling into one of the 7 clusters using an in-context prompt (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). 
As shown in Table S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, the cluster Cancer Support, Symptoms &#x0026; General Health accounted for the largest proportion of sentences, followed by Cancer Prevention &#x0026; Supplementation.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Topic modeling with BERTopic based on all sentences with cancer mentions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e71102_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Hierarchical clustering of topics based on all sentences with cancer mentions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e71102_fig03.png"/></fig><p>Figures S1-S6 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> display the results of hierarchical clustering and topic modeling conducted separately for review sentences grouped by sentiment score 1, 2, 3, 4, and 5. This stratified analysis revealed notable differences in thematic content across sentiment groups. For example, sentiment score group 1 (most negative) contained a higher concentration of topics related to cancer risks, including concerns about carcinogenic ingredients, California Proposition 65 warning labels, artificial sweeteners, product safety, and general ingredient toxicity. In contrast, sentiment score group 5 (most positive) featured a greater number of topics highlighting perceived benefits for cancer-related conditions. 
These included the purported benefits of iodine for thyroid health, calcium vitamin supplementation for bone health, flaxseed as a complementary therapy, narratives of cancer survivorship and thriving, use of sleep aids during cancer treatment, and various anticancer supplements used by survivors.</p><p><xref ref-type="fig" rid="figure4">Figure 4</xref> shows the bipartite graphs of cancer types with symptoms. The bipartite graph is used to show the association between cancer types and symptoms instead of causal relations in a sentence. The edge between cancer types and symptoms represented the frequency of the cancer type-symptom pairs, showing the strength of each association. Zero-shot LLM extracted detailed symptoms, such as pain, inflammation, fatigue, constipation, etc. Results showed associations between stomach cancer and reflux, breast cancer and menstrual cramps, bone cancer and pain, etc.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Bipartite graph of cancer types and symptoms extracted by large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e71102_fig04.png"/></fig><p>Top 15 symptoms in reviews were also identified, with pain being the most frequent symptom, followed by inflammation, fatigue, hot flashes, dry mouth, constipation, cancer sores, nausea, insomnia, neuropathy, lymphedema, incontinence, diarrhea, bloating, fever, and night sweats.</p></sec><sec id="s3-2"><title>Annotation Corpus</title><p><xref ref-type="table" rid="table2">Table 2</xref> shows the statistics for the resulting annotated corpus for each concept and associated classes (type) and certainties. In total, 2067 labels were generated from 159 reviews. Table S4 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> shows the inter-annotator agreements for each concept annotation, with the overall inter-annotator agreement being 0.86. 
IAA for cancer type is the highest (0.97), and harmful outcome is the lowest (0.63). The annotated corpus is publicly accessible through the OHNLP Github [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref35">35</xref>].</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Statistics of the resulting annotated corpus.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Concepts and class (type)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="4">Certainty</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Positive</td><td align="left" valign="top">Negative</td><td align="left" valign="top">Hypothetical</td><td align="left" valign="top">Possible</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Human</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">131</td><td align="left" valign="top">9</td><td align="left" valign="top">100</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top" colspan="5">Pet</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="char" char="." valign="top">18</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">3</td><td align="char" char="." 
valign="top">0</td></tr><tr><td align="left" valign="top">Cancer_related</td><td align="left" valign="top" colspan="4"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">105</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">16</td><td align="left" valign="top">0</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">145</td><td align="left" valign="top">0</td><td align="left" valign="top">51</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top" colspan="5">Other</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="char" char="." valign="top">80</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="char" char="." valign="top">23</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="char" char="." valign="top">242</td><td align="char" char="." valign="top">0</td><td align="char" char="." 
valign="top">15</td><td align="char" char="." valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>There were 1015 labels for Product (itself) and 98 for Product (other).</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Baseline Model Development</title><p>In our study, the annotated data were used for 2 distinct NLP tasks, that is, named entity recognition and text classification. The dataset for the NER task included 1054 annotated samples, with 80% (843 samples) used for training the model and 20% (211 samples) designated for testing its accuracy. For text classification, the dataset consisted of 218 annotated samples, with 80% (174 samples) allocated for training the model and 20% (44 samples) reserved for testing its accuracy. <xref ref-type="table" rid="table3">Table 3</xref> shows the statistics of annotation entity labels for model development.</p><p><xref ref-type="table" rid="table4">Table 4</xref> shows the performance of Bert-base-cased, Bio_ClinicalBERT, and gpt4-1106-preview-chat in NER. In general, the BERT-like models outperformed the LLM, with weighted average <italic>F</italic><sub>1</sub>-scores of 0.6692 for bert-base-cased and 0.6558 for Bio_ClinicalBERT, whereas the best performance of gpt4-1106-preview-chat was a weighted average <italic>F</italic><sub>1</sub>-score of 0.5077, achieved with the many-shot strategy. Among the 3 entities, &#x201C;indicated symptom&#x201D; showed consistently lower performance across all baseline models compared with the other 2 entities, that is, &#x201C;cancer type&#x201D; and &#x201C;product,&#x201D; implying the difficulty of extracting this entity.</p><p><xref ref-type="table" rid="table5">Table 5</xref> shows the performance of baseline models in text classification. The performance of the LLM gpt4-1106-preview-chat using the many-shot strategy exceeded that of the BERT-like models. Specifically, the performance of bert-base-cased and Bio_ClinicalBERT in classifying &#x201C;Harmful outcome&#x201D; was zero. 
This could be explained by the limited number, that is, 16, of &#x201C;Harmful outcome&#x201D; labels in the gold standard. In addition, the IAA of harmful outcome was the lowest during annotation, implying that &#x201C;Harmful outcome&#x201D; classification is the most difficult classification task among all. In contrast, the LLM excelled in this limited-label scenario, achieving the highest <italic>F</italic><sub>1</sub>-score for the 3 classes, that is, 0.6667 for &#x201C;Harmful outcome,&#x201D; 0.8846 for &#x201C;Favorable outcome,&#x201D; and 0.7333 for &#x201C;Ambiguous outcome.&#x201D;</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Statistics of annotation entity labels for model development.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Task and target</td><td align="left" valign="bottom">Criteria</td><td align="left" valign="bottom">Number of labels</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">NER<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">Human, positive</td><td align="left" valign="top">131</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">Cancer_related, positive</td><td align="left" valign="top">105</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Product</td><td align="left" valign="top">Itself</td><td align="left" valign="top">1015</td></tr><tr><td align="left" valign="top" colspan="3">Text classification</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">Cancer_related, positive</td><td align="left" valign="top">145</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">Cancer_related, positive</td><td align="left" valign="top">16</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambiguous_outcome</td><td align="left" valign="top">Cancer_related, hypothetical and possible</td><td align="left" valign="top">57</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>NER: named entity recognition.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance of baseline models in named entity recognition.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model, learning strategy, and entity</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Standard error (<italic>F</italic><sub>1</sub>)</td><td align="left" valign="bottom">Lower CI (<italic>F</italic><sub>1</sub>)</td><td align="left" valign="bottom">Upper CI (<italic>F</italic><sub>1</sub>)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Bert-base-cased</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SFT<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">0.5366</td><td align="left" valign="top">0.6286</td><td align="left" valign="top">0.5789</td><td align="left" valign="top">0.0493</td><td align="left" valign="top">0.4821</td><td align="left" valign="top">0.6756</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">0.1667</td><td align="left" valign="top">0.1429</td><td align="left" valign="top">0.1538</td><td align="left" valign="top">0.0360</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.2245</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Product</td><td align="left" valign="top">0.6773</td><td align="left" valign="top">0.7161</td><td align="left" valign="top">0.6962</td><td align="left" valign="top">0.0459</td><td align="left" valign="top">0.6060</td><td align="left" valign="top">0.7863</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Micro average</td><td align="left" valign="top">0.6514</td><td align="left" valign="top">0.6905</td><td align="left" valign="top">0.6704</td><td align="left" valign="top">0.047</td><td align="left" valign="top">0.5782</td><td align="left" valign="top">0.7625</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro average</td><td align="left" 
valign="top">0.4602</td><td align="left" valign="top">0.4959</td><td align="left" valign="top">0.4763</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.3784</td><td align="left" valign="top">0.5741</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weighted average</td><td align="left" valign="top">0.6495</td><td align="left" valign="top">0.6905</td><td align="left" valign="top">0.6692</td><td align="left" valign="top">0.0470</td><td align="left" valign="top">0.5769</td><td align="left" valign="top">0.7614</td></tr><tr><td align="left" valign="top" colspan="7">Bio_ClinicalBERT</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SFT</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">0.5349</td><td align="left" valign="top">0.697</td><td align="left" valign="top">0.6053</td><td align="left" valign="top">0.0489</td><td align="left" valign="top">0.5094</td><td align="left" valign="top">0.7011</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">0.3000</td><td align="left" valign="top">0.2143</td><td align="left" valign="top">0.2500</td><td align="left" valign="top">0.0433</td><td align="left" valign="top">0.1651</td><td align="left" valign="top">0.3348</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Product</td><td align="left" valign="top">0.695</td><td align="left" valign="top">0.6583</td><td align="left" valign="top">0.6762</td><td align="left" valign="top">0.0468</td><td align="left" valign="top">0.5844</td><td align="left" valign="top">0.7679</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Micro average</td><td align="left" valign="top">0.6675</td><td align="left" valign="top">0.6462</td><td align="left" valign="top">0.6567</td><td align="left" valign="top">0.0474</td><td align="left" valign="top">0.5636</td><td align="left" valign="top">0.75</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro average</td><td align="left" valign="top">0.5100</td><td align="left" valign="top">0.5232</td><td align="left" valign="top">0.5105</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.4125</td><td align="left" valign="top">0.6084</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weighted average</td><td align="left" valign="top">0.6684</td><td align="left" valign="top">0.6462</td><td align="left" valign="top">0.6558</td><td align="left" valign="top">0.0475</td><td align="left" valign="top">0.5626</td><td align="left" valign="top">0.7489</td></tr><tr><td align="left" valign="top" colspan="7">gpt4-1106-preview-chat</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Zero-shot</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">0.2885</td><td align="left" valign="top">0.6818</td><td align="left" valign="top">0.4054</td><td align="left" valign="top">0.0490</td><td align="left" valign="top">0.3091</td><td align="left" valign="top">0.5016</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">0.0759</td><td align="left" valign="top">0.4615</td><td align="left" valign="top">0.1304</td><td align="left" valign="top">0.0336</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.1964</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Product</td><td align="left" valign="top">0.3529</td><td align="left" valign="top">0.3243</td><td align="left" valign="top">0.338</td><td align="left" valign="top">0.0473</td><td align="left" valign="top">0.2452</td><td align="left" valign="top">0.4307</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Micro average</td><td align="left" valign="top">0.2776</td><td align="left" valign="top">0.3619</td><td align="left" valign="top">0.3142</td><td align="left" valign="top">0.0464</td><td align="left" valign="top">0.2232</td><td align="left" valign="top">0.4051</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro average</td><td align="left" valign="top">0.2391</td><td align="left" valign="top">0.4892</td><td align="left" valign="top">0.2913</td><td align="left" valign="top">0.0454</td><td align="left" valign="top">0.2022</td><td align="left" valign="top">0.3803</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weighted average</td><td align="left" valign="top">0.3334</td><td align="left" valign="top">0.3619</td><td align="left" valign="top">0.3333</td><td align="left" valign="top">0.0471</td><td align="left" valign="top">0.2409</td><td align="left" valign="top">0.4256</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Few-shot</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">0.3148</td><td align="left" valign="top">0.7727</td><td align="left" valign="top">0.4474</td><td align="left" valign="top">0.0497</td><td align="left" valign="top">0.3499</td><td align="left" valign="top">0.5448</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">0.0536</td><td align="left" valign="top">0.2308</td><td align="left" valign="top">0.087</td><td align="left" valign="top">0.0281</td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.1422</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Product</td><td align="left" valign="top">0.4743</td><td align="left" valign="top">0.5405</td><td align="left" valign="top">0.5053</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.4073</td><td align="left" valign="top">0.6032</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Micro average</td><td align="left" valign="top">0.3857</td><td align="left" valign="top">0.5447</td><td align="left" valign="top">0.4516</td><td align="left" valign="top">0.0498</td><td align="left" valign="top">0.3540</td><td align="left" valign="top">0.5491</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro average</td><td align="left" valign="top">0.2809</td><td align="left" valign="top">0.5147</td><td align="left" valign="top">0.3465</td><td align="left" valign="top">0.0476</td><td align="left" valign="top">0.2532</td><td align="left" valign="top">0.4397</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weighted average</td><td align="left" 
valign="top">0.4394</td><td align="left" valign="top">0.5447</td><td align="left" valign="top">0.4791</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.3811</td><td align="left" valign="top">0.5770</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Many-shot</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cancer_type</td><td align="left" valign="top">0.4000</td><td align="left" valign="top">0.6364</td><td align="left" valign="top">0.4912</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.3932</td><td align="left" valign="top">0.589</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Indicated_symptom</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Product</td><td align="left" valign="top">0.5672</td><td align="left" valign="top">0.5135</td><td align="left" valign="top">0.539</td><td align="left" valign="top">0.0498</td><td align="left" valign="top">0.44</td><td align="left" valign="top">0.6367</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Micro average</td><td 
align="left" valign="top">0.5079</td><td align="left" valign="top">0.4981</td><td align="left" valign="top">0.5029</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.4049</td><td align="left" valign="top">0.601</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro average</td><td align="left" valign="top">0.3224</td><td align="left" valign="top">0.3833</td><td align="left" valign="top">0.3434</td><td align="left" valign="top">0.0474</td><td align="left" valign="top">0.2503</td><td align="left" valign="top">0.4364</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weighted average</td><td align="left" valign="top">0.5242</td><td align="left" valign="top">0.4981</td><td align="left" valign="top">0.5077</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.4097</td><td align="left" valign="top">0.6056</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SFT: supervised fine-tuning.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Performance of baseline models in text classification.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model, learning strategy, and outcome class</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Standard error (<italic>F</italic><sub>1</sub>)</td><td align="left" valign="bottom">Lower CI (<italic>F</italic><sub>1</sub>)</td><td align="left" valign="bottom">Upper CI (<italic>F</italic><sub>1</sub>)</td></tr></thead><tbody><tr><td align="left" valign="top" 
colspan="7">Bert-base-cased</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SFT<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">0.6470</td><td align="left" valign="top">0.8800</td><td align="left" valign="top">0.7457</td><td align="left" valign="top">0.0435</td><td align="left" valign="top">0.6603</td><td align="left" valign="top">0.8310</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambiguous_outcome</td><td align="left" valign="top">0.7000</td><td align="left" valign="top">0.4375</td><td align="left" valign="top">0.5384</td><td align="left" valign="top">0.0498</td><td align="left" valign="top">0.4406</td><td align="left" valign="top">0.6361</td></tr><tr><td align="left" valign="top" colspan="7">Bio_ClinicalBERT</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SFT</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">0.6486</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.7741</td><td align="left" valign="top">0.0418</td><td align="left" valign="top">0.6921</td><td align="left" valign="top">0.8560</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambiguous_outcome</td><td align="left" valign="top">0.8571</td><td align="left" valign="top">0.375</td><td align="left" valign="top">0.5217</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.4237</td><td align="left" valign="top">0.6196</td></tr><tr><td align="left" valign="top">gpt4-1106-preview-chat</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Zero-shot</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.0471</td><td align="left" valign="top">0.5743</td><td align="left" valign="top">0.7590</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">0.7368</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.6364</td><td align="left" valign="top">0.0481</td><td align="left" valign="top">0.5421</td><td align="left" valign="top">0.7306</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambiguous_outcome</td><td align="left" valign="top">0.4545</td><td align="left" valign="top">0.625</td><td align="left" valign="top">0.5263</td><td align="left" valign="top">0.0499</td><td align="left" valign="top">0.4284</td><td align="left" valign="top">0.6241</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Few-shot</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td 
align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">0.5</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.5714</td><td align="left" valign="top">0.0494</td><td align="left" valign="top">0.4744</td><td align="left" valign="top">0.6683</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">0.6429</td><td align="left" valign="top">0.72</td><td align="left" valign="top">0.6792</td><td align="left" valign="top">0.0466</td><td align="left" valign="top">0.5877</td><td align="left" valign="top">0.7706</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambiguous_outcome</td><td align="left" valign="top">0.3333</td><td align="left" valign="top">0.25</td><td align="left" valign="top">0.2857</td><td align="left" valign="top">0.0451</td><td align="left" valign="top">0.1971</td><td align="left" valign="top">0.3742</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Many-shot</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Harmful_outcome</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.0471</td><td align="left" valign="top">0.5743</td><td align="left" valign="top">0.7590</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Favorable_outcome</td><td align="left" valign="top">0.8519</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.8846</td><td align="left" valign="top">0.0319</td><td align="left" valign="top">0.8219</td><td align="left" valign="top">0.9472</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambiguous_outcome</td><td align="left" valign="top">0.7857</td><td align="left" valign="top">0.6875</td><td align="left" valign="top">0.7333</td><td align="left" valign="top">0.0442</td><td align="left" valign="top">0.6466</td><td align="left" valign="top">0.8199</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>SFT: supervised fine-tuning.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Complementary therapies are increasingly used by cancer survivors to manage persistent symptoms and long-term side effects. Among breast cancer patients, for example, dietary supplement use has been reported in 67% to 87% of cases [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. 
However, complementary therapies are often not integrated into routine oncology care, and clinical research evaluating their effectiveness remains limited. One contributing factor is that many patients do not disclose their use of such therapies to providers, creating a significant gap in understanding how survivors self-manage their health outside clinical settings [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>In this study, we explored the potential of Amazon consumer reviews as a novel data source for capturing survivor-reported experiences with symptom management and complementary therapy use. Through content analysis, we identified several dimensions of cancer survivorship care reflected in these reviews.</p><p>First, topic clustering revealed meaningful subgroups related to survivorship experiences, including discussions of protein powders, cancer-related weight loss, breast cancer and estrogen receptor status, and vitamins for future cancer prevention.</p><p>Second, sentiment-stratified analysis revealed that reviews with lower sentiment scores more often focused on cancer-related risks (eg, toxic ingredients and product harms), while those with higher scores more often highlighted perceived benefits of supplements and other supportive products for managing cancer-related symptoms (Figures S1-S6 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). These patterns provide insights into how survivors interpret and evaluate complementary therapies in relation to their health and recovery.</p><p>Third, associations between specific cancer types (identified via the rule-based dictionary method) and symptoms (identified using zero-shot LLM methods) surfaced detailed symptom management experiences, including pain, fatigue, and gastrointestinal symptoms. 
Notably, the top 15 symptoms included in these reviews reflect common survivorship challenges [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>], with pain being the most frequent. Fourth, while the zero-shot LLM was not formally evaluated in this content analysis, it was effective in identifying symptom-related language at scale, which enabled exploratory symptom mapping across cancer types. These findings illustrate how publicly available consumer data can offer valuable insight into the lived experiences of survivors and their efforts to manage persistent symptoms using accessible, over-the-counter, or complementary therapies.</p><p>Beyond content analysis, our key contributions include the development of a manually annotated dataset with 159 reviews and baseline NLP models for NER and text classification. This resource was intentionally designed to capture nuanced mentions of cancer types (eg, &#x201C;cancer in his bone&#x201D;) and survivor-reported outcomes, laying the groundwork for future applications in survivorship research. These annotations provide a foundation for fine-grained analysis of survivor narratives and outcomes related to self-management of cancer and treatment-related side effects.</p><p>The baseline models demonstrated promising performance. The bert-base-cased model achieved the highest weighted average <italic>F</italic><sub>1</sub>-score for NER, while gpt4-1106-preview-chat achieved the highest <italic>F</italic><sub>1</sub>-scores across all text classification tasks. These results suggest that LLMs, while currently limited in NER performance [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], are highly effective for text classification. 
For instance, even in the limited Harmful outcome category (n=16), GPT-4 was able to generalize and achieve an <italic>F</italic><sub>1</sub>-score of 0.6667 in the zero-shot setting, compared with an <italic>F</italic><sub>1</sub>-score of 0 for fine-tuned BERT/Bio_ClinicalBERT models. This highlights the potential of LLMs for use in survivorship-related classification tasks where annotated data may be limited. To address the low performance of the NER task, data augmentation through synthetic data generation or fine-tuning models with a larger training dataset can be used.</p></sec><sec id="s4-2"><title>Limitations</title><p>Our study has several limitations. First, the sentiment analysis component is constrained by the domain dependence of existing pretrained models [<xref ref-type="bibr" rid="ref44">44</xref>]. While many general-purpose language models can classify sentiment, few are trained specifically on health or cancer-related content. To enable a high-level analysis of emotional tone, we used an existing sentiment model, which achieved 67% exact match accuracy between predicted sentiment and the number of stars assigned to product reviews containing cancer mentions. However, this approach occasionally produced mismatches. For instance, the sentence &#x201C;My husband took this for early stage CLL and after 9 months is in remission.&#x201D; was assigned a sentiment score of 1 (negative), despite clearly expressing a positive outcome. This misalignment reflects the complexity of interpreting sentiment in survivorship contexts, where emotional tone may be influenced by both product experience and the reviewer&#x2019;s cancer journey. As such, sentiment scores may not consistently reflect product efficacy or survivor satisfaction. 
In addition, consumer reviews are inherently subjective and may reflect social influence biases (eg, other reviews) [<xref ref-type="bibr" rid="ref45">45</xref>], or may come from users who are not representative of the broader survivor population [<xref ref-type="bibr" rid="ref46">46</xref>]. Despite these limitations, sentiment analysis helped highlight broad differences in topics across emotional tone. In future work, we plan to fine-tune domain-specific sentiment models trained on health-related and survivorship-specific data to improve classification accuracy and interpretability.</p><p>Second, although the overall IAA was high (<italic>F</italic><sub>1</sub>=0.86), the IAA for the &#x201C;Harmful outcomes&#x201D; category was considerably lower (<italic>F</italic><sub>1</sub>=0.63). This is likely attributable to the small number of annotated instances (n=16), which may have contributed to reduced consistency. However, given the clinical and survivorship importance of identifying adverse outcomes, this remains a critical category. Future annotation efforts will involve expanded training, guideline refinement, and targeted oversampling of underrepresented classes to improve reliability.</p><p>Third, the dataset used for this study includes Amazon reviews posted between May 1996 and July 2014. Consumer behaviors, complementary therapy trends, and survivorship care practices have likely evolved in the past decade. This temporal limitation may restrict the contemporary relevance of some findings, particularly in light of recent shifts toward integrative oncology and growing digital health engagement among survivors. Fourth, our manually annotated dataset comprises only 159 reviews. While this proof-of-concept sample enabled initial model development and feasibility testing, the limited sample size constrains the generalizability and robustness of the resulting models. 
Ongoing annotation work will expand the dataset, with careful attention to balancing reviews across outcome types and sentiment categories to support more comprehensive model training.</p><p>Notably, a new version of the Amazon review dataset&#x2014;spanning May 1996 to September 2023&#x2014;has recently been released and is 245.2% larger than the version used in this study [<xref ref-type="bibr" rid="ref47">47</xref>]. Future work will leverage the expanded dataset to scale annotation efforts, develop more robust models, and generate updated insights into cancer symptom management and complementary therapy use among cancer survivors. These analyses could also support regulatory efforts and health care interventions by highlighting potential product risks and unmet survivor needs reflected in real-world consumer narratives.</p></sec><sec id="s4-3"><title>Conclusion</title><p>Our results demonstrate the potential of Amazon consumer reviews as a novel data source for identifying persistent symptoms, concerns, and self-management strategies among cancer survivors. We presented the design and implementation of a publicly accessible, manually annotated corpus available through the OHNLP GitHub focused on cancer type, symptoms, and symptom management outcomes. This corpus, along with the baseline NLP models developed for named entity recognition and text classification, lays the groundwork for future methodological advancements in cancer survivorship research. Importantly, insights derived from this study could be evaluated in relation to established clinical guidelines for symptom management in cancer survivorship care (eg, American Society of Clinical Oncology and National Comprehensive Cancer Network). Such comparisons may help validate survivor-reported outcomes, reveal novel survivor concerns not routinely captured in clinical care settings, and inform the development of more patient-centered care models. 
By revealing the feasibility of using consumer-generated data for mining survivorship-related experiences, this study offers a promising foundation for future research and argumentation analysis aimed at improving long-term outcomes and support for cancer survivors.</p></sec></sec></body><back><ack><p>Research reported in this publication was supported by the National Library of Medicine under award number R01LM011934, the National Human Genome Research Institute under award number R01HG012748, the National Institute on Aging under award number R01AG072799, the Cancer Prevention and Research Institute of Texas (CPRIT) under award number RR230020, and the National Center for Complementary and Integrative Health under award number 2R01AT009457. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Library of Medicine, the National Human Genome Research Institute, the National Institute on Aging, the National Center for Complementary and Integrative Health, or the State of Texas. We used the generative AI tool GPT-4o to generate cluster labels in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p></ack><notes><sec><title>Data Availability</title><p>The annotated dataset is available at the OHNLP GitHub [<xref ref-type="bibr" rid="ref35">35</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>LW conceptualized and designed the study, designed the annotation guideline, developed NLP models, analyzed the data, and drafted the manuscript. QL designed the study, developed NLP models, analyzed the data, and drafted the manuscript. RL analyzed the data and revised the manuscript. TBH designed the annotation guideline, conducted annotation, and revised the manuscript. HJ designed the annotation guideline and conducted annotation. MH advised on the study design and revised the manuscript. HD revised the manuscript. RZ revised the manuscript. HB revised the manuscript. 
JWF advised on the study design and revised the manuscript. HL conceptualized and designed the study, and revised the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ASIN</term><def><p>Amazon Standard Identification Number</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">NER</term><def><p>named entity recognition</p></def></def-item><def-item><term id="abb4">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb5">OHNLP</term><def><p>open health natural language processing</p></def></def-item><def-item><term id="abb6">SFT</term><def><p>supervised fine-tuning</p></def></def-item><def-item><term id="abb7">UMAP</term><def><p>uniform manifold approximation and projection</p></def></def-item><def-item><term id="abb8">IAA</term><def><p>inter-annotator agreement</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mazza</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Palladini</surname><given-names>M</given-names> </name><name name-style="western"><surname>De Lorenzo</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Persistent psychopathology and neurocognitive impairment in COVID-19 survivors: effect of inflammatory biomarkers at three-month follow-up</article-title><source>Brain Behav Immun</source><year>2021</year><month>05</month><volume>94</volume><issue>138-147</issue><fpage>138</fpage><lpage>147</lpage><pub-id pub-id-type="doi">10.1016/j.bbi.2021.02.021</pub-id><pub-id pub-id-type="medline">33639239</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Blaes</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Adamson</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Foxhall</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bhatia</surname><given-names>S</given-names> </name></person-group><article-title>Survivorship care plans and the commission on cancer standards: the increasing need for better strategies to improve the outcome for survivors of cancer</article-title><source>JCO Oncol Pract</source><year>2020</year><month>08</month><volume>16</volume><issue>8</issue><fpage>447</fpage><lpage>450</lpage><pub-id pub-id-type="doi">10.1200/JOP.19.00801</pub-id><pub-id pub-id-type="medline">32267803</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mollica</surname><given-names>MA</given-names> </name><name name-style="western"><surname>McWhirter</surname><given-names>G</given-names> </name><name name-style="western"><surname>Tonorezos</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Developing national cancer survivorship standards to inform quality of care in the United States using a consensus approach</article-title><source>J Cancer Surviv</source><year>2024</year><month>08</month><volume>18</volume><issue>4</issue><fpage>1190</fpage><lpage>1199</lpage><pub-id pub-id-type="doi">10.1007/s11764-024-01602-6</pub-id><pub-id pub-id-type="medline">38739299</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Stovall</surname><given-names>E</given-names> </name><name name-style="western"><surname>Greenfield</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Hewitt</surname><given-names>M</given-names> </name></person-group><source>From Cancer Patient to Cancer Survivor: Lost in Transition</source><year>2005</year><publisher-name>National Academies Press</publisher-name><pub-id pub-id-type="other">978-0-309-09595-2</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Attai</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Cowher</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Al-Hamadani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Schoger</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Staley</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Landercasper</surname><given-names>J</given-names> </name></person-group><article-title>Twitter social media is an effective tool for breast cancer patient education and support: patient-reported outcomes by survey</article-title><source>J Med Internet Res</source><year>2015</year><month>07</month><day>30</day><volume>17</volume><issue>7</issue><fpage>e188</fpage><pub-id pub-id-type="doi">10.2196/jmir.4721</pub-id><pub-id pub-id-type="medline">26228234</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Attai</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Dizon</surname><given-names>DS</given-names> </name></person-group><article-title>Social media and oncology: the time is now</article-title><source>JCO Oncol Pract</source><year>2022</year><month>08</month><volume>18</volume><issue>8</issue><fpage>525</fpage><lpage>527</lpage><pub-id pub-id-type="doi">10.1200/OP.21.00820</pub-id><pub-id 
pub-id-type="medline">35015573</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spitzley</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Burgoon</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Dunbar</surname><given-names>NE</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>S</given-names> </name></person-group><article-title>Linguistic measures of personality in group discussions</article-title><source>Front Psychol</source><year>2022</year><volume>13</volume><issue>887616</issue><fpage>887616</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2022.887616</pub-id><pub-id pub-id-type="medline">36186305</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eysenbach</surname><given-names>G</given-names> </name></person-group><article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the Internet</article-title><source>J Med Internet Res</source><year>2009</year><month>03</month><day>27</day><volume>11</volume><issue>1</issue><fpage>e11</fpage><pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id><pub-id pub-id-type="medline">19329408</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eysenbach</surname><given-names>G</given-names> </name></person-group><article-title>Infodemiology and infoveillance tracking online health 
information and cyberbehavior for public health</article-title><source>Am J Prev Med</source><year>2011</year><month>05</month><volume>40</volume><issue>5 Suppl 2</issue><fpage>S154</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2011.02.006</pub-id><pub-id pub-id-type="medline">21521589</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sutar</surname><given-names>SG</given-names> </name></person-group><article-title>Intelligent data mining technique of social media for improving health care</article-title><conf-name>2017 International Conference on Intelligent Computing and Control Systems (ICICCS)</conf-name><conf-date>Jun 15-16, 2017</conf-date><conf-loc>Madurai, India</conf-loc><fpage>1356</fpage><lpage>1360</lpage><pub-id pub-id-type="doi">10.1109/ICCONS.2017.8250690</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>B</given-names> </name></person-group><article-title>Health natural language processing: methodology development and applications</article-title><source>JMIR Med Inform</source><year>2021</year><month>10</month><day>21</day><volume>9</volume><issue>10</issue><fpage>e23898</fpage><pub-id pub-id-type="doi">10.2196/23898</pub-id><pub-id pub-id-type="medline">34673533</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Acquisition of a lexicon for family history information: bidirectional encoder representations from transformers-assisted sublanguage analysis</article-title><source>JMIR Med Inform</source><year>2023</year><month>06</month><day>27</day><volume>11</volume><fpage>e48072</fpage><pub-id pub-id-type="doi">10.2196/48072</pub-id><pub-id pub-id-type="medline">37368483</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name></person-group><article-title>MedTator: a serverless annotation tool for corpus development</article-title><source>Bioinformatics</source><year>2022</year><month>03</month><day>4</day><volume>38</volume><issue>6</issue><fpage>1776</fpage><lpage>1778</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btab880</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name></person-group><article-title>Wonder at Chemotimelines 2024: MedTimeline: an end-to-end NLP system for timeline extraction from clinical narratives</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><fpage>483</fpage><lpage>487</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.48</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Balasubramanian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thirumavalavan</surname><given-names>N</given-names> </name><name name-style="western"><surname>Srivatsav</surname><given-names>A</given-names> </name><etal/></person-group><article-title>An analysis of popular online erectile dysfunction supplements</article-title><source>J Sex Med</source><year>2019</year><month>06</month><volume>16</volume><issue>6</issue><fpage>843</fpage><lpage>852</lpage><pub-id pub-id-type="doi">10.1016/j.jsxm.2019.03.269</pub-id><pub-id pub-id-type="medline">31036522</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alsoudi</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Loya</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abouodah</surname><given-names>H</given-names> </name><name name-style="western"><surname>Koo</surname><given-names>E</given-names> 
</name><name name-style="western"><surname>Rahimy</surname><given-names>E</given-names> </name></person-group><article-title>An evaluation of popular online eye health products on Amazon marketplace</article-title><source>Ophthalmic Surg Lasers Imaging Retina</source><year>2023</year><month>03</month><volume>54</volume><issue>3</issue><fpage>147</fpage><lpage>152</lpage><pub-id pub-id-type="doi">10.3928/23258160-20230221-03</pub-id><pub-id pub-id-type="medline">36944073</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fan</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hooten</surname><given-names>WM</given-names> </name></person-group><article-title>Retrospective content analysis of consumer product reviews related to chronic pain</article-title><source>Front Digit Health</source><year>2023</year><volume>5</volume><issue>958338</issue><fpage>958338</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2023.958338</pub-id><pub-id pub-id-type="medline">37168528</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name></person-group><article-title>Mining and summarizing customer reviews</article-title><conf-name>KDD &#x2019;04: Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining</conf-name><conf-date>Aug 22, 2004</conf-date><conf-loc>Seattle, WA, 
USA</conf-loc><fpage>168</fpage><lpage>177</lpage><pub-id pub-id-type="doi">10.1145/1014052.1014073</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ding</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>PS</given-names> </name></person-group><article-title>A holistic lexicon-based approach to opinion mining</article-title><conf-name>WSDM &#x2019;08: Proceedings of the 2008 International Conference on Web Search and Data Mining</conf-name><conf-date>Feb 11, 2008</conf-date><conf-loc>Palo Alto, California, USA</conf-loc><fpage>231</fpage><lpage>240</lpage><pub-id pub-id-type="doi">10.1145/1341531.1341561</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Boland</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wira-Alam</surname><given-names>A</given-names> </name><name name-style="western"><surname>Messerschmidt</surname><given-names>R</given-names> </name></person-group><article-title>Creating an annotated corpus for sentiment analysis of German product reviews</article-title><year>2013</year><publisher-name>Social Science Open Access Repository</publisher-name></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>R</given-names> </name><name name-style="western"><surname>McAuley</surname><given-names>J</given-names> </name></person-group><article-title>Ups and downs: modeling the visual evolution of fashion trends with one-class collaborative filtering</article-title><conf-name>WWW &#x2019;16: Proceedings 
of the 25th International Conference on World Wide Web</conf-name><conf-date>Apr 11, 2016</conf-date><conf-loc>Montr&#x00E9;al, Qu&#x00E9;bec, Canada</conf-loc><fpage>507</fpage><lpage>517</lpage><pub-id pub-id-type="doi">10.1145/2872427.2883037</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rastegar-Mojarad</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Clinical information extraction applications: a literature review</article-title><source>J Biomed Inform</source><year>2018</year><month>01</month><volume>77</volume><issue>34-49</issue><fpage>34</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2017.11.011</pub-id><pub-id pub-id-type="medline">29162496</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Schriml</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>QR</given-names> </name><etal/></person-group><article-title>Generating a focused view of disease ontology cancer terms for pan-cancer data integration and analysis</article-title><source>Database (Oxford)</source><year>2015</year><fpage>bav032</fpage><pub-id pub-id-type="doi">10.1093/database/bav032</pub-id><pub-id pub-id-type="medline">25841438</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Bielinski</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Sohn</surname><given-names>S</given-names> </name><etal/></person-group><article-title>An information extraction framework for cohort identification using electronic health records</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2013</year><volume>2013</volume><issue>149</issue><fpage>149</fpage><lpage>153</lpage><pub-id pub-id-type="medline">24303255</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>nlptown/bert-base-multilingual-uncased-sentiment</article-title><source>Huggingface</source><access-date>2024-03-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment">https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Radha</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bhuvaneswari</surname><given-names>NS</given-names> </name></person-group><article-title>Optimizing sentiment analysis of Amazon product reviews using a sophisticated fish swarm optimization-guided radial basis function neural network (Sfso-Rbfnn)</article-title><source>J Theor Appl Inf Technol</source><year>2023</year><volume>101</volume><issue>11</issue><comment><ext-link ext-link-type="uri" xlink:href="https://www.jatit.org/volumes/Vol101No11/17Vol101No11.pdf">https://www.jatit.org/volumes/Vol101No11/17Vol101No11.pdf</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adams</surname><given-names>DZ</given-names> </name><name 
name-style="western"><surname>Gruss</surname><given-names>R</given-names> </name><name name-style="western"><surname>Abrahams</surname><given-names>AS</given-names> </name></person-group><article-title>Automated discovery of safety and efficacy concerns for joint &#x0026; muscle pain relief treatments from online reviews</article-title><source>Int J Med Inform</source><year>2017</year><month>04</month><volume>100</volume><issue>108-120</issue><fpage>108</fpage><lpage>120</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2017.01.005</pub-id><pub-id pub-id-type="medline">28241932</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Babu</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Kanaga</surname><given-names>EGM</given-names> </name></person-group><article-title>Sentiment analysis in social media data for depression detection using artificial intelligence: a review</article-title><source>SN Comput Sci</source><year>2022</year><volume>3</volume><issue>1</issue><fpage>74</fpage><pub-id pub-id-type="doi">10.1007/s42979-021-00958-1</pub-id><pub-id pub-id-type="medline">34816124</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Muennighoff</surname><given-names>N</given-names> </name></person-group><article-title>C-pack: packaged resources to advance general Chinese embedding</article-title><source>arXiv</source><comment>Preprint posted online on Sep 24, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2309.07597</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>McInnes</surname><given-names>L</given-names> </name><name name-style="western"><surname>Healy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Melville</surname><given-names>J</given-names> </name></person-group><article-title>UMAP: uniform manifold approximation and projection for dimension reduction</article-title><source>arXiv</source><comment>Preprint posted online on Feb 9, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1802.03426</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grootendorst</surname><given-names>M</given-names> </name></person-group><article-title>BERTopic: neural topic modeling with a class-based TF-IDF procedure</article-title><source>arXiv</source><comment>Preprint posted online on Mar 11, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.05794</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McInnes</surname><given-names>L</given-names> </name><name name-style="western"><surname>Healy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Astels</surname><given-names>S</given-names> </name></person-group><article-title>hdbscan: Hierarchical density based clustering</article-title><source>J Open Source Softw</source><year>2017</year><volume>2</volume><issue>11</issue><fpage>205</fpage><pub-id pub-id-type="doi">10.21105/joss.00205</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.06825</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Agarwal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>LM</given-names> </name><etal/></person-group><article-title>Many-shot in-context learning</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.11018</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>OHNLP Amazon review annotation</article-title><source>GitHub</source><access-date>2024-12-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/OHNLP/Amazon-review-annotation">https://github.com/OHNLP/Amazon-review-annotation</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Velicer</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Ulrich</surname><given-names>CM</given-names> </name></person-group><article-title>Vitamin and mineral supplement use among US adults after cancer diagnosis: a systematic review</article-title><source>J Clin 
Oncol</source><year>2008</year><month>02</month><day>1</day><volume>26</volume><issue>4</issue><fpage>665</fpage><lpage>673</lpage><pub-id pub-id-type="doi">10.1200/JCO.2007.13.5905</pub-id><pub-id pub-id-type="medline">18235127</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kwan</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Weltzien</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kushi</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Castillo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Slattery</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Caan</surname><given-names>BJ</given-names> </name></person-group><article-title>Dietary patterns and breast cancer recurrence and survival among women with early-stage breast cancer</article-title><source>J Clin Oncol</source><year>2009</year><month>02</month><day>20</day><volume>27</volume><issue>6</issue><fpage>919</fpage><lpage>926</lpage><pub-id pub-id-type="doi">10.1200/JCO.2008.19.4035</pub-id><pub-id pub-id-type="medline">19114692</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eisenberg</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Ettner</surname><given-names>SL</given-names> </name><etal/></person-group><article-title>Trends in alternative medicine use in the United States, 1990-1997: results of a follow-up national 
survey</article-title><source>JAMA</source><year>1998</year><month>11</month><day>11</day><volume>280</volume><issue>18</issue><fpage>1569</fpage><lpage>1575</lpage><pub-id pub-id-type="doi">10.1001/jama.280.18.1569</pub-id><pub-id pub-id-type="medline">9820257</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frenkel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ben-Arye</surname><given-names>E</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Sierpina</surname><given-names>V</given-names> </name></person-group><article-title>Approach to communicating with patients about the use of nutritional supplements in cancer care</article-title><source>South Med J</source><year>2005</year><month>03</month><volume>98</volume><issue>3</issue><fpage>289</fpage><lpage>294</lpage><pub-id pub-id-type="doi">10.1097/01.SMJ.0000154776.71057.E8</pub-id><pub-id pub-id-type="medline">15813155</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Green</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Hart-Johnson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Loeffler</surname><given-names>DR</given-names> </name></person-group><article-title>Cancer-related chronic pain: examining quality of life in diverse cancer survivors</article-title><source>Cancer</source><year>2011</year><month>05</month><day>1</day><volume>117</volume><issue>9</issue><fpage>1994</fpage><lpage>2003</lpage><pub-id pub-id-type="doi">10.1002/cncr.25761</pub-id><pub-id pub-id-type="medline">21509777</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Harden</surname><given-names>JK</given-names> </name></person-group><article-title>Symptom burden and quality of life in survivorship: a review of the literature</article-title><source>Cancer Nurs</source><year>2015</year><volume>38</volume><issue>1</issue><fpage>E29</fpage><lpage>54</lpage><pub-id pub-id-type="doi">10.1097/NCC.0000000000000135</pub-id><pub-id pub-id-type="medline">24831042</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name></person-group><article-title>Large language models struggle in token-level clinical named entity recognition</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.00731</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt 
engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Pang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>L</given-names> </name></person-group><article-title>Opinion mining and sentiment analysis</article-title><source>Foundations and Trends&#x00AE; in Information Retrieval</source><year>2008</year><volume>2</volume><fpage>1</fpage><lpage>135</lpage><pub-id pub-id-type="doi">10.1561/9781601981516</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muchnik</surname><given-names>L</given-names> </name><name name-style="western"><surname>Aral</surname><given-names>S</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>SJ</given-names> </name></person-group><article-title>Social influence bias: a randomized experiment</article-title><source>Science</source><year>2013</year><month>08</month><day>9</day><volume>341</volume><issue>6146</issue><fpage>647</fpage><lpage>651</lpage><pub-id pub-id-type="doi">10.1126/science.1240466</pub-id><pub-id pub-id-type="medline">23929980</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Langhe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Fernbach</surname><given-names>PM</given-names> </name><name 
name-style="western"><surname>Lichtenstein</surname><given-names>DR</given-names> </name></person-group><article-title>Navigating by the stars: investigating the actual and perceived validity of online user ratings</article-title><source>J Consum Res</source><year>2016</year><month>04</month><day>1</day><volume>42</volume><issue>6</issue><fpage>817</fpage><lpage>833</lpage><pub-id pub-id-type="doi">10.1093/jcr/ucv047</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>McAuley</surname><given-names>J</given-names> </name></person-group><article-title>Bridging language and items for retrieval and recommendation</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.03952</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Amazon review - annotation guidelines.</p><media xlink:href="cancer_v11i1e71102_app1.docx" xlink:title="DOCX File, 778 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Tables showing schema of the annotated labels, distribution of sentiment scores across the sentences with cancer mentions, number of sentences corresponding to each cluster, and inter-annotator agreements.</p><media xlink:href="cancer_v11i1e71102_app2.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material 
id="app3"><label>Multimedia Appendix 3</label><p>Large language model prompting instruction.</p><media xlink:href="cancer_v11i1e71102_app3.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Figures depicting hierarchical clustering and topic modeling.</p><media xlink:href="cancer_v11i1e71102_app4.docx" xlink:title="DOCX File, 2203 KB"/></supplementary-material></app-group></back></article>