<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Cancer</journal-id><journal-id journal-id-type="publisher-id">cancer</journal-id><journal-id journal-id-type="index">21</journal-id><journal-title>JMIR Cancer</journal-title><abbrev-journal-title>JMIR Cancer</abbrev-journal-title><issn pub-type="epub">2369-1999</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e62833</article-id><article-id pub-id-type="doi">10.2196/62833</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Association Between Risk Factors and Major Cancers: Explainable Machine Learning Approach</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Huang</surname><given-names>Xiayuan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ren</surname><given-names>Shushun</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mao</surname><given-names>Xinyue</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Sirui</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Chen</surname><given-names>Elle</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>He</surname><given-names>Yuqi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Jiang</surname><given-names>Yun</given-names></name><degrees>MS, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biostatistics, Yale University</institution><addr-line>New Haven</addr-line><addr-line>CT</addr-line><country>United States</country></aff><aff id="aff2"><institution>School of Nursing, University of Michigan&#x2013;Ann Arbor</institution><addr-line>400 North Ingalls Street</addr-line><addr-line>Ann Arbor</addr-line><addr-line>MI</addr-line><country>United States</country></aff><aff id="aff3"><institution>College of Literature Science and the Arts, University of Michigan&#x2013;Ann Arbor</institution><addr-line>Ann Arbor</addr-line><addr-line>MI</addr-line><country>United States</country></aff><aff id="aff4"><institution>University Library, San Jose State University</institution><addr-line>San Jose</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Cahill</surname><given-names>Naomi</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bracken-Clarke</surname><given-names>Dara</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Benson</surname><given-names>Ryzen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yun Jiang, MS, PhD, School of Nursing, University of Michigan&#x2013;Ann Arbor, 400 North Ingalls Street, Ann Arbor, 
MI, 48109, United States, 1 7347633705, 1 7346472416; <email>jiangyu@umich.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>2</day><month>5</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e62833</elocation-id><history><date date-type="received"><day>02</day><month>06</month><year>2024</year></date><date date-type="rev-recd"><day>08</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>20</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Xiayuan Huang, Shushun Ren, Xinyue Mao, Sirui Chen, Elle Chen, Yuqi He, Yun Jiang. Originally published in JMIR Cancer (<ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org">https://cancer.jmir.org</ext-link>), 2.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cancer, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://cancer.jmir.org/">https://cancer.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://cancer.jmir.org/2025/1/e62833"/><abstract><sec><title>Background</title><p>Cancer is a life-threatening disease and a leading cause of death worldwide, with an estimated 611,000 deaths and over 2 million new cases in the United States in 2024. 
The rising incidence of major cancers, including among younger individuals, highlights the need for early screening and monitoring of risk factors to manage and decrease cancer risk.</p></sec><sec><title>Objective</title><p>This study aimed to leverage explainable machine learning models to identify and analyze the key risk factors associated with breast, colorectal, lung, and prostate cancers. By uncovering significant associations between risk factors and these major cancer types, we sought to enhance the understanding of cancer diagnosis risk profiles. Our goal was to facilitate more precise screening, early detection, and personalized prevention strategies, ultimately contributing to better patient outcomes and promoting health equity.</p></sec><sec sec-type="methods"><title>Methods</title><p>Deidentified electronic health record data from the Medical Information Mart for Intensive Care (MIMIC)&#x2013;III were used to identify patients with 4 types of cancer who had longitudinal hospital visits before a cancer diagnosis first appeared in their records. Their records were matched and combined with those of patients without cancer diagnoses using propensity scores based on demographic factors. Three advanced models, penalized logistic regression, random forest, and multilayer perceptron (MLP), were applied to rank the risk factors for each cancer type, with feature importance analysis for random forest and MLP models. Rank biased overlap was used to compare the similarity of ranked risk factors across cancer types.</p></sec><sec sec-type="results"><title>Results</title><p>Our framework evaluated the prediction performance of explainable machine learning models, with the MLP model demonstrating the best performance. 
It achieved an area under the receiver operating characteristic curve of 0.78 for breast cancer (n=58), 0.76 for colorectal cancer (n=140), 0.84 for lung cancer (n=398), and 0.78 for prostate cancer (n=104), outperforming other baseline models (<italic>P</italic>&#x003C;.001). In addition to demographic risk factors, the most prominent nontraditional risk factors overlapped across models and cancer types, including hyperlipidemia (odds ratio [OR] 1.14, 95% CI 1.11&#x2010;1.17; <italic>P</italic>&#x003C;.01), diabetes (OR 1.34, 95% CI 1.29&#x2010;1.39; <italic>P</italic>&#x003C;.01), depressive disorders (OR 1.11, 95% CI 1.06&#x2010;1.16; <italic>P</italic>&#x003C;.01), heart diseases (OR 1.42, 95% CI 1.32&#x2010;1.52; <italic>P</italic>&#x003C;.01), and anemia (OR 1.22, 95% CI 1.14&#x2010;1.30; <italic>P</italic>&#x003C;.01). The similarity analysis indicated that lung cancer had a unique risk factor pattern compared with the other cancer types.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The study&#x2019;s findings demonstrated the effectiveness of explainable machine learning models in assessing nontraditional risk factors for major cancers and highlighted the importance of considering unique risk profiles for different cancer types. Moreover, this research served as a hypothesis-generating foundation, providing preliminary results for future investigation into cancer diagnosis risk analysis and management. 
Furthermore, expanding collaboration with clinical experts for external validation would be essential to refine model outputs, integrate findings into practice, and enhance their impact on patient care and cancer prevention efforts.</p></sec></abstract><kwd-group><kwd>electronic health record</kwd><kwd>EHR</kwd><kwd>cancer risk modeling</kwd><kwd>risk factor analysis</kwd><kwd>explainable machine learning</kwd><kwd>machine learning</kwd><kwd>ML</kwd><kwd>risk factor</kwd><kwd>major cancers</kwd><kwd>monitoring</kwd><kwd>cancer risk</kwd><kwd>breast cancer</kwd><kwd>colorectal cancer</kwd><kwd>lung cancer</kwd><kwd>prostate cancer</kwd><kwd>cancer patients</kwd><kwd>clinical decision-making</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Cancer is a life-threatening disease and leading cause of death worldwide. In 2024, 611,000 people were estimated to have died from cancer in the United States, and the estimated new cancer cases will reach more than 2 million for the first time [<xref ref-type="bibr" rid="ref1">1</xref>]. This surge includes rising incidence rates for major cancers, including breast, prostate, lung, and colorectal cancers, which display the trend of increasingly affecting younger individuals who have many more years of life expectancy [<xref ref-type="bibr" rid="ref1">1</xref>]. The US Preventive Services Task Force modified the recommended age for colorectal cancer screening from 50 to 45 years for people at average risk in 2021 and adjusted the recommendation for breast cancer screening for all women to start at the age of 40 years in 2024 [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Similar upward trends in the incidence of early-onset cancers are observed in other high-income countries, suggesting shared risk factors and exposures across these regions. 
However, besides those uncontrollable risk factors, such as previous cancer diagnosis, family history of cancer, and genetics or inherited cancer syndrome, many cancer risk factors, including lifestyle factors, are modifiable and can be managed to decrease people&#x2019;s risk for cancer [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Extensive evidence highlights the potential benefits of early identification of individuals at high risk for cancer, which can contribute to improved prevention, more effective treatment, reduced cancer burden, and better long-term outcomes. However, demonstrating a clear survival advantage [<xref ref-type="bibr" rid="ref5">5</xref>] from screening remains challenging, with notable exceptions such as cervical cancer [<xref ref-type="bibr" rid="ref6">6</xref>]. To ensure accurate evaluations, it is essential to address biases such as lead-time and length bias in screening, as they can lead to overestimation of its benefits [<xref ref-type="bibr" rid="ref7">7</xref>]. In the context of breast cancer, it was estimated that early access to treatment services following breast cancer screening could have reduced breast cancer mortality by 25%&#x2010;40% [<xref ref-type="bibr" rid="ref8">8</xref>]. Given the tremendous benefits of early identification of high-risk patients, an increasing number of cancer risk prediction models have been developed [<xref ref-type="bibr" rid="ref9">9</xref>]. However, traditional models used for cancer risk prediction, such as logistic regression (LR) and Cox regression, often demonstrate moderate discrimination accuracy, with an area under the receiver operating characteristic curve (AUC) ranging from 0.53 to 0.64 [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. 
These models frequently emphasize family history and may have limited generalizability, potentially introducing biases when applied to specific subpopulations [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Furthermore, nontraditional risk factors, such as chronic diseases, are often overlooked, despite evidence suggesting that chronic conditions can elevate cancer risk similarly to lifestyle factors [<xref ref-type="bibr" rid="ref16">16</xref>]. This highlights the need for more advanced methods to enhance cancer diagnosis risk prediction and support effective cancer prevention strategies.</p><p>Machine learning has shown promising potential in cancer prediction by leveraging electronic health record (EHR) data to identify risk factors [<xref ref-type="bibr" rid="ref17">17</xref>]. Current applications range from developing predictive models for early cancer detection to personalized treatment recommendations and outcome predictions, based on various patient characteristics and biomarkers. Despite these advancements, several challenges remain in cancer prediction using machine learning [<xref ref-type="bibr" rid="ref18">18</xref>]. A key issue is the need for a deeper understanding of risk factors within and across different cancer types [<xref ref-type="bibr" rid="ref19">19</xref>]. As research progresses, explainable machine learning offers a meaningful step forward in improving the efficacy and transparency of predictive models [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. These models not only enhance predictive accuracy but also provide interpretable insights into how predictions are made, fostering trust and facilitating clinical decision-making [<xref ref-type="bibr" rid="ref23">23</xref>]. By systematically identifying and excluding irrelevant features, explainable approaches can reduce noise and streamline the prediction process. 
However, it is important to recognize that feature selection algorithms can be sensitive to dataset characteristics, where small changes in the data may lead to differing results [<xref ref-type="bibr" rid="ref24">24</xref>]. This underscores the importance of carefully selecting features that are most relevant, contributing to a deeper understanding of cancer diagnosis risk factors and improving predictive performance.</p><p>Hence, this study presented comprehensive research aimed at uncovering the association between pivotal factors and the risks of 4 major cancer diagnoses (breast, prostate, lung, and colorectal) through the use of explainable machine learning techniques on penalized LR, random forest (RF), and multilayer perceptron (MLP). Our primary objective was to pinpoint the significant features that exert an influence on the risks associated with the diagnosis of these major cancers and to delineate the patterns of risk factors corresponding to each cancer type. Such insights can contribute to enhanced risk monitoring and patient stratification and provide valuable support for clinicians in their decision-making processes, ultimately improving the quality of patient care. By elucidating these critical factors and their associated risk factor patterns, we provided clinicians with valuable insights through rigorous analysis for enhancing risk monitoring and patient care across various cancer types.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Experimental Dataset</title><p>Our study was conducted using data from the Medical Information Mart for Intensive Care (MIMIC)&#x2013;III, a comprehensive, structured, longitudinal EHR dataset that is publicly available [<xref ref-type="bibr" rid="ref25">25</xref>]. 
This dataset contains deidentified, detailed clinical data from intensive care unit (ICU) admissions between 2001 and 2012 at Beth Israel Deaconess Medical Center in Boston, Massachusetts, and is accessible to the global research community under a data use agreement. We used the most recent version (v2.0 released in January 2023) for this work which contains a broad spectrum of data, including information on individual patients&#x2019; health and health care from various inpatient and outpatient visits, such as diagnoses, prescriptions, lab tests, and procedures. These visits include emergency room admissions and subsequent hospital transfers, where a patient&#x2019;s transfer to a ward or subsequent re-admission to the ICU within the same hospitalization period was considered a single visit. In total, this dataset contains 58,976 admissions of 46,520 patients.</p><p>Additionally, we investigated the health status and prevalence of a few common chronic diseases for the MIMIC-III dataset, compared with the prevalence of these chronic diseases in the US population. The MIMIC-III dataset shows that hypertension affects 47.97% of ICU patients, while in the US population, prevalence ranges from 46.9% to 49.4% [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Diabetes mellitus is present in 21.20% of MIMIC-III patients, whereas it affects 11.6% of the US population and 14.7% of adults [<xref ref-type="bibr" rid="ref28">28</xref>]. Hypercholesterolemia appears in 14.94% of ICU cases, with US estimates between 10% and 11.4% [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Congestive heart failure is recorded in 27.38% of MIMIC-III patients, while the lifetime risk in the US is 24% [<xref ref-type="bibr" rid="ref31">31</xref>]. Esophageal reflux affects 15.33% of ICU patients and 20% of people in the US [<xref ref-type="bibr" rid="ref32">32</xref>]. 
Pneumonia is diagnosed in 12.46% of ICU patients, while 24.9% of US adults have reported cases [<xref ref-type="bibr" rid="ref33">33</xref>]. Anemia affects 14.02% of ICU patients, while 5.6% of the US population has the condition [<xref ref-type="bibr" rid="ref34">34</xref>]. Acquired hypothyroidism is observed in 10.71% of MIMIC-III patients and 4.6% of US adults [<xref ref-type="bibr" rid="ref35">35</xref>]. Tobacco use is recorded in 7.76% of ICU cases, while 19.8% of US adults report smoking [<xref ref-type="bibr" rid="ref36">36</xref>]. Depressive disorders affect 8.17% of ICU patients, while 9.5% of American adults have been diagnosed [<xref ref-type="bibr" rid="ref37">37</xref>]. Chronic airway obstruction is reported in 10.24% of MIMIC-III cases, while national estimates range from 6.0% to 6.1% [<xref ref-type="bibr" rid="ref38">38</xref>].</p></sec><sec id="s2-2"><title>Data Preprocessing</title><p>We included patients with 4 types of cancers (breast, colorectal, lung, and prostate) identified using <italic>International Classification of Diseases, Ninth Revision</italic> (<italic>ICD-9</italic>) codes associated with the diagnosis of each type of cancer (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>We took a few steps to preprocess the experimental dataset, starting with the consolidation of 3 main tables from the MIMIC-III database. These included: (1) foundational patient information, capturing demographics and initial hospital admission data; (2) a reference table for <italic>ICD-9</italic> codes, detailing both codes and corresponding diagnostic labels; and (3) logs of patient visit sequences with associated <italic>ICD-9</italic> codes. This consolidation linked the records via patient IDs to construct a detailed longitudinal dataset. <xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the data processing workflow of this study. 
Patients&#x2019; ages were determined by deducting their date of birth from their initial hospital admission date, with the result rounded to the nearest year. Any patient records missing demographic details (such as ethnicity, marital status, or religion) were omitted, narrowing the dataset to a total of 21,372 unique individuals. Our study focused on patients who had multiple hospital visits prior to their cancer diagnosis presence in the record to identify potential risk factors. After a cancer diagnosis code was recognized, further visits were disregarded. These records were combined with those of patients without a cancer diagnosis. A label was created as 1 if a visit included an <italic>ICD-9</italic> code for a cancer diagnosis and 0 if not. To ensure a balanced dataset in terms of cancer diagnosis, the study matched patients diagnosed with cancer with those without cancer using propensity score matching based on demographic factors. <xref ref-type="table" rid="table1">Table 1</xref> contains a detailed description of patient characteristics for 4 cancer types.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Medical Information Mart for Intensive Care (MIMIC)&#x2013;III data processing pipeline. 
EHR: electronic health record; <italic>ICD-9</italic>: <italic>International Classification of Diseases, Ninth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e62833_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of patients for 4 types of cancer.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Breast cancer (n=58)</td><td align="left" valign="bottom" colspan="2">Colorectal cancer (n=140)</td><td align="left" valign="bottom" colspan="2">Lung cancer (n=398)</td><td align="left" valign="bottom" colspan="2">Prostate cancer (n=104)</td></tr><tr><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">With cancer (n=29)</td><td align="left" valign="top">Without cancer (n=29)</td><td align="left" valign="top">With cancer (n=70)</td><td align="left" valign="top">Without cancer (n=70)</td><td align="left" valign="top">With cancer (n=199)</td><td align="left" valign="top">Without cancer (n=199)</td><td align="left" valign="top">With cancer (n=52)</td><td align="left" valign="top">Without cancer (n=52)</td></tr></thead><tbody><tr><td align="left" valign="top">Age (years), median (range)</td><td align="left" valign="top">60 (40&#x2010;86)</td><td align="left" valign="top">60 (39&#x2010;86)</td><td align="left" valign="top">76 (21&#x2010;87)</td><td align="left" valign="top">75.5 (29&#x2010;87)</td><td align="left" valign="top">69 (39&#x2010;88)</td><td align="left" valign="top">69 (39&#x2010;87)</td><td align="left" valign="top">74.5 (52&#x2010;88)</td><td align="left" valign="top">73.5 (52&#x2010;88)</td></tr><tr><td align="left" valign="top">Sex, n (%)</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td 
align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Female</td><td align="left" valign="top">27 (93.1)</td><td align="left" valign="top">27 (93.1)</td><td align="left" valign="top">35 (50.0)</td><td align="left" valign="top">33 (47.1)</td><td align="left" valign="top">93 (46.7)</td><td align="left" valign="top">83 (41.7)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">&#x2003;Male</td><td align="left" valign="top">2 (6.9)</td><td align="left" valign="top">2 (6.9)</td><td align="left" valign="top">35 (50.0)</td><td align="left" valign="top">37 (52.9)</td><td align="left" valign="top">106 (53.3)</td><td align="left" valign="top">116 (58.3)</td><td align="left" valign="top">52 (100)</td><td align="left" valign="top">52 (100)</td></tr><tr><td align="left" valign="top">Race, n (%)</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;White</td><td align="left" valign="top">20 (69.0)</td><td align="left" valign="top">21 (72.4)</td><td align="left" valign="top">51 (72.9)</td><td align="left" valign="top">52 (74.3)</td><td align="left" valign="top">162 (81.4)</td><td align="left" valign="top">157 (78.9)</td><td align="left" valign="top">41 (78.8)</td><td align="left" valign="top">39 (75.0)</td></tr><tr><td align="left" valign="top">&#x2003;Non-White</td><td align="left" valign="top">9 (31.0)</td><td align="left" valign="top">8 (27.6)</td><td align="left" valign="top">19 (27.1)</td><td align="left" 
valign="top">18 (25.7)</td><td align="left" valign="top">37 (18.6)</td><td align="left" valign="top">42 (21.1)</td><td align="left" valign="top">11 (21.2)</td><td align="left" valign="top">13 (25.0)</td></tr><tr><td align="left" valign="top">Marital status, n (%)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Married</td><td align="left" valign="top">16 (55.2)</td><td align="left" valign="top">15 (51.7)</td><td align="left" valign="top">37 (52.9)</td><td align="left" valign="top">41 (58.6)</td><td align="left" valign="top">109 (54.8)</td><td align="left" valign="top">113 (56.8)</td><td align="left" valign="top">31 (59.6)</td><td align="left" valign="top">32 (61.5)</td></tr><tr><td align="left" valign="top">&#x2003;Not married</td><td align="left" valign="top">13 (44.8)</td><td align="left" valign="top">14 (48.3)</td><td align="left" valign="top">33 (47.1)</td><td align="left" valign="top">29 (41.4)</td><td align="left" valign="top">90 (45.2)</td><td align="left" valign="top">86 (43.2)</td><td align="left" valign="top">21 (40.4)</td><td align="left" valign="top">20 (38.5)</td></tr><tr><td align="left" valign="top">Religion, n (%)</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Catholic</td><td align="left" valign="top">15 
(51.7)</td><td align="left" valign="top">13 (44.8)</td><td align="left" valign="top">35 (50.0)</td><td align="left" valign="top">31 (44.3)</td><td align="left" valign="top">111 (55.8)</td><td align="left" valign="top">107 (53.8)</td><td align="left" valign="top">19 (36.5)</td><td align="left" valign="top">19 (36.5)</td></tr><tr><td align="left" valign="top">&#x2003;Jewish</td><td align="left" valign="top">7 (24.1)</td><td align="left" valign="top">7 (24.1)</td><td align="left" valign="top">17 (24.3)</td><td align="left" valign="top">18 (25.7)</td><td align="left" valign="top">33 (16.6)</td><td align="left" valign="top">31 (15.6)</td><td align="left" valign="top">11 (21.2)</td><td align="left" valign="top">10 (19.2)</td></tr><tr><td align="left" valign="top">&#x2003;Protestant Quaker</td><td align="left" valign="top">7 (24.1)</td><td align="left" valign="top">7 (24.1)</td><td align="left" valign="top">14 (20.0)</td><td align="left" valign="top">11 (15.7)</td><td align="left" valign="top">42 (21.1)</td><td align="left" valign="top">38 (19.1)</td><td align="left" valign="top">16 (30.8)</td><td align="left" valign="top">17 (32.7)</td></tr><tr><td align="left" valign="top">&#x2003;Other</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (10.3)</td><td align="left" valign="top">4 (5.7)</td><td align="left" valign="top">10 (14.3)</td><td align="left" valign="top">13 (6.5)</td><td align="left" valign="top">23 (11.6)</td><td align="left" valign="top">6 (11.5)</td><td align="left" valign="top">6 (11.5)</td></tr><tr><td align="left" valign="top">ICU<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> visits, n</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" 
valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Mean</td><td align="left" valign="top">2.5</td><td align="left" valign="top">1.5</td><td align="left" valign="top">2.6</td><td align="left" valign="top">1.5</td><td align="left" valign="top">2.6</td><td align="left" valign="top">1.6</td><td align="left" valign="top">2.5</td><td align="left" valign="top">1.5</td></tr><tr><td align="left" valign="top">&#x2003;Maximum</td><td align="left" valign="top">5</td><td align="left" valign="top">5</td><td align="left" valign="top">6</td><td align="left" valign="top">6</td><td align="left" valign="top">10</td><td align="left" valign="top">12</td><td align="left" valign="top">7</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top">&#x2003;Minimum</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><italic>ICD-9</italic><sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> codes for each patient, n</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Mean</td><td align="left" valign="top">25</td><td align="left" valign="top">14</td><td align="left" valign="top">27</td><td align="left" valign="top">16</td><td align="left" valign="top">26</td><td align="left" valign="top">15</td><td align="left" valign="top">30</td><td align="left" valign="top">15</td></tr><tr><td align="left" valign="top">&#x2003;Maximum</td><td 
align="left" valign="top">51</td><td align="left" valign="top">68</td><td align="left" valign="top">81</td><td align="left" valign="top">71</td><td align="left" valign="top">82</td><td align="left" valign="top">96</td><td align="left" valign="top">63</td><td align="left" valign="top">66</td></tr><tr><td align="left" valign="top">&#x2003;Minimum</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">9</td><td align="left" valign="top">3</td><td align="left" valign="top">6</td><td align="left" valign="top">2</td><td align="left" valign="top">5</td><td align="left" valign="top">4</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Categories of marital status include &#x201C;single&#x201D;, &#x201C;divorced&#x201D;, &#x201C;widowed&#x201D;, and &#x201C;separated&#x201D;.</p></fn><fn id="table1fn2"><p><sup>b</sup>ICU: intensive care unit.</p></fn><fn id="table1fn3"><p><sup>c</sup><italic>ICD-9</italic>: <italic>International Classification of Diseases, Ninth Revision</italic>.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Feature Selection</title><p>Our experiment&#x2019;s initial dataset comprised thousands of diagnosis codes intended for predicting cancer diagnosis risk. Aware of some features&#x2019; potential redundancy and less informative nature, we conducted a feature selection process. This involved assessing the relevance and importance of each feature in relation to 4 specific types of cancer. We performed a correlation-based feature selection process to identify a subset of features that were highly correlated with the target cancer outcomes. 
This was followed by a thorough review of relevant literature and consultation with experts to validate and refine the selected features.</p></sec><sec id="s2-4"><title>Framework</title><p>In this work, we applied 3 advanced models, penalized LR, RF, and MLP, based on their demonstrated accuracy and robustness in handling high-dimensional datasets. RF and MLP excel at identifying complex, nonlinear interactions among variables without requiring predefined interaction terms. This capability is crucial for analyzing interactions between risk factors and cancer outcomes. Our choice of RF and MLP was determined by a desire to balance complexity with interpretability, as well as to ensure computational efficiency. Both methods are straightforward and offer high interpretability, which makes them excellent foundational models for exploring how different features influence cancer diagnosis risk.</p><p>Since the task aimed at forecasting cancer diagnosis risk by considering important and relevant risk factors, we evaluated the efficacy of our methodologies by employing several critical performance metrics: AUC, accuracy, specificity, sensitivity, and the <italic>F</italic><sub>1</sub>-score for each model. We partitioned the dataset into 3 sections for model development: 70% for training, 10% for validation, and 20% for testing. The model that exhibited the best results on the validation set was further subjected to an in-depth analysis of the test set, using a 3-fold cross-validation technique to calculate its AUC precisely. To enhance our understanding of how our machine learning models contribute to cancer prevention, we also quantified the impact of each feature on the prediction of 4 cancer types. We then ranked these features according to their significance. All statistical analyses and model implementations were coded using Python, with the scikit-learn library serving as the foundation for our predictive framework [<xref ref-type="bibr" rid="ref39">39</xref>]. 
To assess the generalizability of the model, we validated its performance using an independent ICU dataset from MIMIC-IV-ED (Medical Information Mart for Intensive Care), which represents an extended patient population. For each cancer type, we randomly sampled 200 cases and 200 matched controls from MIMIC-IV-ED, ensuring no patient overlap with the MIMIC-III experimental dataset.</p><p>To investigate the similarity of feature rankings across different cancer types, we applied rank biased overlap (RBO) [<xref ref-type="bibr" rid="ref40">40</xref>], a similarity measure of 2 ranked lists. The RBO score ranges between 0 and 1, where a higher score indicates greater similarity between the lists. A score of 1 implies perfect overlap, meaning the 2 lists are identical in both order and content. On the other hand, a score of 0 suggests no overlap between the lists.</p><p>Mathematically, let <inline-formula><mml:math id="ieqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:math></inline-formula>  be the high-dimensional feature input. Let <inline-formula><mml:math id="ieqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> be the corresponding label.
<inline-formula><mml:math id="ieqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:mstyle></mml:math></inline-formula> means not affected, and <inline-formula><mml:math id="ieqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mstyle></mml:math></inline-formula>  means affected. Our goal is to learn a predictive function <inline-formula><mml:math id="ieqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula>  that best classifies the data. We built 3 state-of-the-art models for each of the 4 cancer types in this study:</p><list list-type="bullet"><list-item><p>Penalized LR: given <inline-formula><mml:math id="ieqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula>  training instances, we considered L1 regularized LR by minimizing the following function: <inline-formula><mml:math id="ieqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:munderover><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>i</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:mi>&#x03B8;</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo
stretchy="false">|</mml:mo></mml:mrow><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mstyle></mml:math></inline-formula>.</p></list-item><list-item><p>RF [<xref ref-type="bibr" rid="ref41">41</xref>]: a robust ensemble learning method that constructs multiple decision trees during training to improve prediction accuracy and prevent overfitting, where <inline-formula><mml:math id="ieqn8"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula> denotes a decision tree used as a base learner. The RF model was trained by iteratively selecting features from root to leaf nodes and aggregating multiple trees with the weights from a subset of the training instances. The nodes and the weights in the model reflect their importance to the final prediction.</p></list-item><list-item><p>MLP [<xref ref-type="bibr" rid="ref42">42</xref>]: a type of artificial neural network that consists of at least 3 layers of nodes: an input layer, one or more hidden layers, and an output layer. Each node, or artificial neuron, in one layer connects with a certain weight to every node in the following layer, and nodes do not connect within the same layer. Nonlinear activation functions, such as the sigmoid or rectified linear unit, are applied to the weighted sum of inputs to a neuron, determining its output signal.</p></list-item></list><p>To rank the impact of the features on the predictive models, relative to all 3 models, we used a permutation importance score to rank all features in the trained MLP models [<xref ref-type="bibr" rid="ref43">43</xref>].
The scores were defined by the mean decrease in accuracy of the trained model when each feature was permuted.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>MIMIC-III data are the result of a collaboration between Beth Israel Deaconess Medical Center (BIDMC) and Massachusetts Institute of Technology. Data collected at BIDMC as part of routine clinical care are deidentified, transformed, and made available to researchers who have completed training in human research and signed a data use agreement. The Institutional Review Board (HUM00230096) at the BIDMC granted a waiver of informed consent and approved the sharing of the research resource. This study was determined to be exempt from further ethical review. The contributing author, XH, obtained the necessary authorization to access the anonymized dataset and oversaw the meticulous data extraction process.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Feature Selection</title><p>We conducted a feature selection process to refine thousands of diagnosis codes for predicting cancer diagnosis risk, using correlation-based selection to identify the most relevant features for 4 cancer types. Through this rigorous analysis, we aimed to distill the dataset down to a more manageable and meaningful subset of features. Eventually, we identified 33 features (recategorized into 20 factors for further analysis, <xref ref-type="table" rid="table2">Table 2</xref>) that emerged as particularly crucial for accurately predicting cancer diagnosis risk. 
These features were meticulously curated, ensuring that only the most informative and pertinent variables were retained for our predictive models.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Features selected for predicting cancer diagnosis risks.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Features</td><td align="left" valign="bottom">Factors</td></tr></thead><tbody><tr><td align="left" valign="top">Acidosis</td><td align="left" valign="top">Acidosis</td></tr><tr><td align="left" valign="top">Acute kidney failure, unspecified</td><td align="left" valign="top">Acute kidney failure</td></tr><tr><td align="left" valign="top">Age</td><td align="left" valign="top">Age</td></tr><tr><td align="left" valign="top">Anemia, unspecified</td><td align="left" valign="top">Anemia</td></tr><tr><td align="left" valign="top">Acute posthemorrhagic anemia</td><td align="left" valign="top">Anemia</td></tr><tr><td align="left" valign="top">Depressive disorder, not elsewhere classified</td><td align="left" valign="top">Depressive disorder</td></tr><tr><td align="left" valign="top">Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled</td><td align="left" valign="top">Diabetes</td></tr><tr><td align="left" valign="top">Esophageal reflux</td><td align="left" valign="top">Esophageal reflux</td></tr><tr><td align="left" valign="top">Ethnicity</td><td align="left" valign="top">Ethnicity</td></tr><tr><td align="left" valign="top">Gender</td><td align="left" valign="top">Gender</td></tr><tr><td align="left" valign="top">Cardiac complications, not elsewhere classified</td><td align="left" valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Aortocoronary bypass status</td><td align="left" valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Coronary atherosclerosis of native coronary artery</td><td align="left" 
valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Old myocardial infarction</td><td align="left" valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Congestive heart failure, unspecified</td><td align="left" valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Atrial fibrillation</td><td align="left" valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Subendocardial infarction, initial episode of care</td><td align="left" valign="top">Heart disease</td></tr><tr><td align="left" valign="top">Pure hypercholesterolemia</td><td align="left" valign="top">Hyperlipidemia</td></tr><tr><td align="left" valign="top">Other and unspecified hyperlipidemia</td><td align="left" valign="top">Hyperlipidemia</td></tr><tr><td align="left" valign="top">Unspecified essential hypertension</td><td align="left" valign="top">Hypertension</td></tr><tr><td align="left" valign="top">Other iatrogenic hypotension</td><td align="left" valign="top">Hypotension</td></tr><tr><td align="left" valign="top">Unspecified acquired hypothyroidism</td><td align="left" valign="top">Hypothyroidism</td></tr><tr><td align="left" valign="top">Marital status</td><td align="left" valign="top">Marital status</td></tr><tr><td align="left" valign="top">Religion</td><td align="left" valign="top">Religion</td></tr><tr><td align="left" valign="top">Acute respiratory failure</td><td align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">Unspecified pleural effusion</td><td align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">Pneumonia, organism unspecified</td><td align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">Pneumonitis due to inhalation of food or vomitus</td><td align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">Pulmonary collapse</td><td 
align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">Chronic airway obstruction, not elsewhere classified</td><td align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">Unspecified septicemia</td><td align="left" valign="top">Sepsis</td></tr><tr><td align="left" valign="top">Personal history of tobacco use</td><td align="left" valign="top">Tobacco use</td></tr><tr><td align="left" valign="top">Urinary tract infection, site not specified</td><td align="left" valign="top">Urinary tract infection (UTI)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Model Performance</title><p>For each predicted cancer outcome, we carried out the experiment by predicting cancer using the entire diagnosis history of the patient by building LR, RF, and MLP models. <xref ref-type="table" rid="table3">Table 3</xref> illustrates the accuracy, specificity, sensitivity, and <italic>F</italic><sub>1</sub>-score of these 3 models for breast, colorectal, lung, and prostate cancers. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the receiver operating characteristic plots of 3 models for 4 types of cancer, respectively. Both <xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref> show that within the 3 models, MLP performs the best, RF falls in the middle, and LR ranks last. It is worth noting that MLP achieved an AUC of 0.78 for breast cancer, 0.76 for colorectal cancer, 0.84 for lung cancer, and 0.78 for prostate cancer, demonstrating a higher AUC over traditional risk factor-based models and a statistically significant superiority over random chance. The underwhelming results from the LR model led us to investigate the complexity of risk factors for prediction. 
Compared with LR, MLP reveals the intricate, nonlinear associations between risk factors and the likelihood of cancer, offering meaningful insights into the collective influence of these risk factors on cancer diagnosis risk.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of model performance across 4 types of cancer on Medical Information Mart for Intensive Care (MIMIC)&#x2013;III.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Breast cancer</td><td align="left" valign="bottom" colspan="3">Colorectal cancer</td><td align="left" valign="bottom" colspan="3">Lung cancer</td><td align="left" valign="bottom" colspan="3">Prostate cancer</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">MLP<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">LR</td><td align="left" valign="top">RF</td><td align="left" valign="top">MLP</td><td align="left" valign="top">LR</td><td align="left" valign="top">RF</td><td align="left" valign="top">MLP</td><td align="left" valign="top">LR</td><td align="left" valign="top">RF</td><td align="left" valign="top">MLP</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.60</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.59</td><td align="left" valign="top">0.72</td><td align="left" 
valign="top">0.78</td></tr><tr><td align="left" valign="top">Specificity</td><td align="left" valign="top">0.45</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.87</td><td align="left" valign="top">0.53</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.84</td></tr><tr><td align="left" valign="top">Sensitivity</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.65</td><td align="left" valign="top">0.65</td><td align="left" valign="top">0.72</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.60</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.84</td><td align="left" valign="top">0.63</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.76</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>LR: logistic regression.</p></fn><fn id="table3fn2"><p><sup>b</sup>RF: random forest.</p></fn><fn id="table3fn3"><p><sup>c</sup>MLP: multilayer perceptron.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Area under the receiver operating 
characteristic curve (AUC) performance of the 3 binary classification models (logistic regression [LR], random forest [RF], and multilayer perceptron [MLP]). The figure shows AUC curves of breast cancer, colorectal cancer, lung cancer, and prostate cancer for LR, RF, and MLP, respectively.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e62833_fig02.png"/></fig><p>Additionally, Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents the AUC, accuracy, specificity, sensitivity, and <italic>F</italic><sub>1</sub>-score for the 3 models across breast, colorectal, lung, and prostate cancers. Among the models evaluated, MLP demonstrated the highest performance, achieving an AUC of 0.88 for breast cancer, 0.83 for colorectal cancer, 0.90 for lung cancer, and 0.85 for prostate cancer.</p></sec><sec id="s3-3"><title>Feature Importance Analysis</title><p>We analyzed the feature importance for each cancer type further to investigate the potential impact of risk factors on cancer. <xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table5">5</xref> present the feature importance analysis of RF and MLP, showcasing the top-ranked risk factors for each type of cancer. The ranks of these factors were relatively different by model and cancer type, although some consistency can be observed across cancer types. Age emerged as the top risk factor across all 4 types of cancer; race/ethnicity ranked among the top 10 factors for all cancers from all models except for the RF-based lung cancer and prostate cancer models; gender was ranked among the top 10 in MLP-based models but not in any RF-based models; marital status and religion were presented for some types of cancer in some of the models; and tobacco use emerged as an important factor for patients with lung and prostate cancer exclusively.
However, all these demographic risk factors were included in the top 20 factors for all cancer types (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Similarly, RF-based models identified hypertension, heart diseases, respiratory/pulmonary diseases, and acute kidney failure as the common top risk factors for all types of cancers, while MLP-based models highlighted hyperlipidemia, diabetes, depressive disorder, and heart diseases. We calculated the odds ratio (OR) for each highlighted feature to assess its association with overall cancer diagnosis risk across 4 cancer types. The results indicated that hyperlipidemia had an OR of 1.14 (95% CI 1.11&#x2010;1.17; <italic>P</italic>&#x003C;.001), while diabetes showed a stronger association with an OR of 1.34 (95% CI 1.29&#x2010;1.39; <italic>P</italic>&#x003C;.01). Similarly, depressive disorders were linked to an OR of 1.11 (95% CI 1.06&#x2010;1.16; <italic>P</italic>&#x003C;.01), and heart diseases exhibited the highest association with an OR of 1.42 (95% CI 1.32&#x2010;1.52; <italic>P</italic>&#x003C;.01). Last, anemia was also significantly associated with cancer diagnosis risk, with an OR of 1.22 (95% CI 1.14&#x2010;1.30; <italic>P</italic>&#x003C;.01). These findings suggest a statistically significant relationship between these conditions and an increased risk of developing these 4 types of cancer. In MLP-based models, respiratory/pulmonary diseases and acute kidney failure were only presented as the top 10 for lung cancer. Both RF and MLP-based models pinpointed anemia as the top risk for breast cancer. <xref ref-type="fig" rid="figure3">Figure 3</xref> shows the RBO similarity scores of risk factors for 4 types of cancer according to MLP-based models. Low similarity scores are presented between lung cancer and each of the other 3 cancer types, all around 0.58, suggesting distinct patterns of risk factors associated with lung cancer.
Risk factors for breast and prostate cancers show the most similar ranking with an RBO similarity score of 0.76. A moderate similarity score between colorectal and breast cancers is about the same as the score between colorectal and prostate cancer, both around 0.70.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Top-10 ranked features generated across 4 different cancer types in random forest.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Ranking</td><td align="left" valign="top">Breast cancer</td><td align="left" valign="top">Colorectal cancer</td><td align="left" valign="top">Lung cancer</td><td align="left" valign="top">Prostate cancer</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Age</td><td align="left" valign="top">Age</td><td align="left" valign="top">Age</td><td align="left" valign="top">Age</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Hypertension</td><td align="left" valign="top">Respiratory or pulmonary diseases<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">Hypertension</td><td align="left" valign="top">Hypertension</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Religion</td><td align="left" valign="top">Hypertension</td><td align="left" valign="top">Religion</td><td align="left" valign="top">Religion</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Marital status</td><td align="left" valign="top">Acute kidney failure</td><td align="left" valign="top">Hyperlipidemia</td><td align="left" valign="top">Heart diseases<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Respiratory or pulmonary diseases</td><td align="left" valign="top">Diabetes</td><td align="left" valign="top">Heart
diseases</td><td align="left" valign="top">Marital status</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Heart diseases</td><td align="left" valign="top">Heart diseases</td><td align="left" valign="top">Acute kidney failure</td><td align="left" valign="top">UTI<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Race or ethnicity</td><td align="left" valign="top">Hyperlipidemia</td><td align="left" valign="top">UTI</td><td align="left" valign="top">Respiratory or pulmonary diseases</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Depressive disorders</td><td align="left" valign="top">Race or ethnicity</td><td align="left" valign="top">Respiratory or pulmonary diseases</td><td align="left" valign="top">Anemia</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Acute kidney failure</td><td align="left" valign="top">Religion</td><td align="left" valign="top">Marital status</td><td align="left" valign="top">Hyperthyroidism</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Anemia</td><td align="left" valign="top">Acidosis</td><td align="left" valign="top">Anemia</td><td align="left" valign="top">Diabetes</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Respiratory or pulmonary diseases include pneumonia, acute respiratory failure, chronic airway obstruction, and other respiratory or pulmonary complications.</p></fn><fn id="table4fn2"><p><sup>b</sup>Heart diseases include atrial fibrillation, myocardial infarction, congestive heart failure, coronary atherosclerosis, and other cardiac complications.</p></fn><fn id="table4fn3"><p><sup>c</sup>UTI: urinary tract infection.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Top-10 ranked features generated across 4 different cancer types 
in multilayer perceptron.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Ranking</td><td align="left" valign="bottom">Breast cancer</td><td align="left" valign="bottom">Colorectal cancer</td><td align="left" valign="bottom">Lung cancer</td><td align="left" valign="bottom">Prostate cancer</td></tr></thead><tbody><tr><td align="char" char="." valign="top">1</td><td align="left" valign="top">Age</td><td align="left" valign="top">Age</td><td align="left" valign="top">Tobacco use</td><td align="left" valign="top">Age</td></tr><tr><td align="char" char="." valign="top">2</td><td align="left" valign="top">Gender</td><td align="left" valign="top">Diabetes</td><td align="left" valign="top">Age</td><td align="left" valign="top">Gender</td></tr><tr><td align="char" char="." valign="top">3</td><td align="left" valign="top">Hyperlipidemia</td><td align="left" valign="top">Anemia</td><td align="left" valign="top">Respiratory or pulmonary diseases<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">Race or ethnicity</td></tr><tr><td align="char" char="." valign="top">4</td><td align="left" valign="top">Heart diseases<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">Acidosis</td><td align="left" valign="top">Gender</td><td align="left" valign="top">Tobacco use</td></tr><tr><td align="char" char="." valign="top">5</td><td align="left" valign="top">Race or ethnicity</td><td align="left" valign="top">Hyperlipidemia</td><td align="left" valign="top">Race or ethnicity</td><td align="left" valign="top">Diabetes</td></tr><tr><td align="char" char="." valign="top">6</td><td align="left" valign="top">Marital status</td><td align="left" valign="top">Sepsis</td><td align="left" valign="top">Diabetes</td><td align="left" valign="top">Hyperlipidemia</td></tr><tr><td align="char" char="." 
valign="top">7</td><td align="left" valign="top">Depressive disorder</td><td align="left" valign="top">Gender</td><td align="left" valign="top">Hyperlipidemia</td><td align="left" valign="top">Heart diseases</td></tr><tr><td align="char" char="." valign="top">8</td><td align="left" valign="top">Religion</td><td align="left" valign="top">Race or ethnicity</td><td align="left" valign="top">Hypertension</td><td align="left" valign="top">Marital status</td></tr><tr><td align="char" char="." valign="top">9</td><td align="left" valign="top">Anemia</td><td align="left" valign="top">Marital status</td><td align="left" valign="top">Heart diseases</td><td align="left" valign="top">Religion</td></tr><tr><td align="char" char="." valign="top">10</td><td align="left" valign="top">Hypothyroidism</td><td align="left" valign="top">Depressive disorder</td><td align="left" valign="top">Acute kidney failure</td><td align="left" valign="top">Depressive disorder</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Respiratory or pulmonary diseases include pneumonia, acute respiratory failure, chronic airway obstruction, and other respiratory or pulmonary complications.</p></fn><fn id="table5fn2"><p><sup>b</sup>Heart diseases include atrial fibrillation, myocardial infarction, congestive heart failure, coronary atherosclerosis, and other cardiac complications.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Rank biased overlap similarity score of risk factors for 4 cancer types. 
A high value represents high similarity, and a low value represents low similarity of risk factor ranks between 2 cancer types.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cancer_v11i1e62833_fig03.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study used comprehensive patient diagnosis histories to evaluate the association between key risk factors and cancer outcomes and identify risk factor patterns across different cancer types using penalized LR, RF, and MLP models. The analysis identified the top-ranking risk factors, including nontraditional risk factors such as the diagnosis of hyperlipidemia, diabetes, depressive disorders, heart diseases, and anemia, in addition to demographic factors such as age, sex, and race/ethnicity, for the most prevalent 4 types of cancer, including breast, colorectal, lung, and prostate cancers. The model performance evaluation revealed the valuable potential of neural network-based models, especially MLPs, in oncology for predicting cancer diagnosis risks across cancer types. MLPs exhibit a strong capability to model complex, nonlinear interactions among diverse risk factors, making them potentially valuable tools for identifying patterns in cancer diagnosis risk and informing early detection strategies. However, their application in clinical interventions should be guided by a solid scientific rationale and supported by pathological models that explain the role of these risk factors in disease development. Additionally, validation across different cohorts and, ideally, prospective studies are necessary to ensure their reliability and clinical utility. This advantage is particularly important given the model&#x2019;s capacity to integrate and interpret the intricate relationships between clinical factors present in EHRs.
In contrast to simpler models like LR, which struggle with the multidimensional nature of risk factors on cancer diagnosis in many cases, MLPs offer a more detailed and comprehensive analysis, enhancing our understanding of how these factors collectively impact cancer diagnosis risk and improving the precision of preventive strategies in clinical settings. Last, this study does not aim to establish causal inference but rather to examine significant overlapping risk factors that may contribute to cancer diagnosis risk, particularly those observed in patients with other medical conditions. While these diagnoses are not independent causal determinants of cancer, their presence may be associated with an increased risk. Careful consideration of these associations is essential for a comprehensive understanding of cancer risk factors and their potential interactions.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>Prior cancer risk prediction models usually focus on lifestyle factors like smoking, diet, alcohol consumption, physical activity, and sun exposure as key variables [<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref46">46</xref>]. Some models have also incorporated genetic risk factors [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. However, many of these models reported less optimal performance, such as a high specificity but low sensitivity [<xref ref-type="bibr" rid="ref46">46</xref>] or a low AUC of around 0.65 [<xref ref-type="bibr" rid="ref48">48</xref>]. Chronic diseases are often overlooked as risk factors for cancer, and they are not often targeted in cancer prevention strategies. The association between some of these diseases and cancers may partly be due to shared risk factors, such as aging, obesity, diet, and physical inactivity. However, they can also be independent risk factors for cancer. 
For example, diabetes mellitus has been identified as an independent risk factor for colon and rectal cancer in a meta-analysis of studies that either controlled for smoking and obesity, or smoking, obesity, and physical exercise [<xref ref-type="bibr" rid="ref49">49</xref>]. As nontraditional risk factors, the influence of certain chronic conditions on cancer has been brought to researchers&#x2019; attention in the past decade. A prospective cohort study with 405,878 participants followed for an average of 8.7 years demonstrated that 8 common chronic diseases accounted for more than 20% of cancer risk, which is comparable to 5 major lifestyle factors, such as smoking and lack of physical activity [<xref ref-type="bibr" rid="ref16">16</xref>]. These 8 chronic diseases or markers included blood pressure, total cholesterol, heart rate, diabetes, proteinuria, glomerular filtration rate, pulmonary disease, and gouty arthritis marker [<xref ref-type="bibr" rid="ref16">16</xref>]. However, as these diseases or markers were pre-selected by the researchers based on their disease burden worldwide, some other essential influential conditions might be missed. Our models confirmed most of these 8 diseases as the top-ranking risk factors. Additionally, some new conditions were revealed in our models among the top 10 factors for 4 types of cancer, such as depressive disorder, anemia, hypothyroidism, sepsis, urinary tract infection, and acidosis, which encourages further exploration. Certainly, some of these diagnoses may be directly related to the cancer itself. For example, anemia is a common symptom of metastatic breast cancer and a side effect of chemotherapy [<xref ref-type="bibr" rid="ref50">50</xref>]. In addition, sepsis and colorectal cancer have demonstrated a complex relationship and may have shared pathophysiological traits and potential bacterial associations reported in the literature [<xref ref-type="bibr" rid="ref51">51</xref>]. 
Notably, tobacco use and respiratory/pulmonary diseases emerged as pivotal risk factors, specifically for lung cancer, which is not surprising based on our knowledge in the field. Diabetes and anemia were highlighted as significant risk factors for colorectal cancer, which is congruent with the literature [<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. Iron deficiency has long been recognized as an independent predictor of colorectal cancer, which may be due to chronic blood loss from the gastrointestinal tract and the inflammation associated with malignancy [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. These conditions could have shared risk factors with cancer. However, emerging evidence implies that they may have more complicated relationships, including shared pathophysiological mechanisms that need further exploration [<xref ref-type="bibr" rid="ref56">56</xref>]. Moreover, cancer prevention strategies should consider the impact of comorbid conditions on the incidence of cancer and particularly their joint impact on cancer risks [<xref ref-type="bibr" rid="ref53">53</xref>].</p><p>Understanding the relationships between various risk factors and cancer diagnosis risk is pivotal for the early detection and prevention of cancer. In this context, our feature importance analysis using RF and MLP models pinpointed critical risk factors for different cancer types and explored patterns of these risk factors across various cancers. Although the ranks of risk factors for cancers differed slightly between the RF- and MLP-based models, similar patterns were presented among the top 10 factors (<xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table5">5</xref>), which are interpretable and supported by the literature. 
Both models highlighted age as the predominant risk factor across all 4 types of cancer; it is evident that as age increases, the incidence rates for cancer overall climb steadily. Alongside age, demographic variables such as gender, race/ethnicity, marital status, and religion emerged within the top 10 features [<xref ref-type="bibr" rid="ref57">57</xref>]. Racial/ethnic disparities in cancer incidence and outcomes are well-known. Employing culturally tailored community awareness and education programs may increase cancer screening to improve early-stage diagnoses and modify risk behaviors for cancer prevention [<xref ref-type="bibr" rid="ref58">58</xref>]. Although there may not be existing evidence to confirm that marital status is an independent risk factor for cancer, observational studies demonstrate that married status is associated with reduced risk of cancer-specific and all-cause mortality [<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>]. Religion and spirituality are important in patient cancer care, and specifically, a systematic review suggests a positive association between religious attendance and cancer screening use [<xref ref-type="bibr" rid="ref61">61</xref>]. Our models confirmed the significance of these risk factors for each cancer type; in addition, our RF-based model facilitated an interpretable analysis, allowing us to clearly rank the significance of each risk factor, while the MLP-based model provided deeper insights into complex, nonlinear interactions among the risk factors. 
This approach enriches our understanding of how specific risk factors influence cancer diagnosis, enhancing the potential for developing tailored intervention strategies that address the unique risk profiles associated with different cancer types and potentially shared risk patterns across prevalent cancer types.</p><p>The analysis of the similarity among risk factors for the diagnosis of 4 types of cancer also revealed interesting findings. As breast and prostate cancer are both hormone-dependent cancers, it is understandable that their importance-ranked risk factors share a high level of similarity. However, lung cancer had more unique ranked risk factors than other types of cancer, which may be because lung cancer is more sensitive to environmental risk factor exposure. The findings from our analysis underscore the shared risk factors and heterogeneous nature of cancer and highlight the importance of considering unique risk profiles for different cancer types. This also urges us to address the fundamental mechanism of risk factors leading to cancers. Such insights are crucial for developing tailored prevention strategies, optimizing screening protocols, and informing personalized treatment approaches to mitigate the burden of lung cancer and improve patient outcomes.</p></sec><sec id="s4-3"><title>Limitations</title><p>First, the use of the MIMIC-III dataset in this study on explainable machine learning for cancer risk prediction presents certain limitations that may impact the generalizability of the findings. Since the data are derived from ICU patient records, they primarily represent individuals with severe conditions, and the available <italic>ICD</italic> codes may not fully capture disease complexity, potentially leading to incomplete representations of patient conditions. Additionally, the limited sample size for patients with cancer may impact predictive performance and increase the risk of overfitting. 
Both limitations may affect the generalizability of the findings. To enhance the robustness of future research, integrating more recent and varied data sources and validating findings across different cohorts are essential steps. Second, one limitation comes from the application of explainable machine learning models for cancer risk prediction. Employing advanced techniques like penalized LR, RF, and MLP, this research seeks to optimize predictive accuracy. However, each model inherently embodies trade-offs: while more complex models, such as MLPs, may enhance performance, they often compromise on interpretability. This presents significant challenges in clinical settings, where understanding the reasoning behind model predictions is crucial for acceptance and trust by medical practitioners. Third, another limitation of this study arises from the inherent nature of machine learning models, which are primarily designed to detect correlations in data and associations between features and the outcome rather than establish causal relationships. These models rely on the quality and comprehensiveness of the input data, and while they can reveal significant associative patterns, they do not focus on differentiating whether the associations observed are causal. Meanwhile, given the limited availability of patient lifestyle and socioeconomic information, additional factors related to social determinants of health, such as socioeconomic status, employment, and family size, can be considered as potential confounders within the model for future improvement. 
To address all the above, future work should integrate causal inference frameworks to validate the relationships suggested by the machine learning predictions and provide insights into underlying mechanisms.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In conclusion, our study established a predictive framework using EHR data to assess the association between risk factors and cancer outcomes using explainable ML models across major cancer types. We reported critical nontraditional chronic condition risk factors in addition to common demographic risk factors and outlined distinct patterns for each of the 4 cancer types studied. Additionally, we explored the similarities and differences in risk factor patterns across these cancers. These insights contribute to a better understanding of cancer risk profiles and can help improve cancer diagnosis and risk monitoring, offering supportive guidance for clinical decision-making.</p></sec></sec></body><back><ack><p>The authors would like to thank the Beth Israel Deaconess Medical Center and the MIT Laboratory for Computational Physiology for making the Medical Information Mart for Intensive Care (MIMIC)&#x2013;III database available to the research community. The authors would also like to acknowledge the support from the University of Michigan School of Nursing and the Yale School of Public Health throughout this study.</p></ack><notes><sec><title>Data Availability</title><p>The datasets analyzed during this study are available on PhysioNet [<xref ref-type="bibr" rid="ref62">62</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>XH and YJ conceived the study. XH and SR implemented the algorithm, conducted the experiments, and performed all the analyses. SR generated the visualizations of the results. XH and YJ supervised the study. XH, SR, EC, YH, and YJ wrote the manuscript. 
All authors provided feedback and approved the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb2">BIDMC</term><def><p>Beth Israel Deaconess Medical Center</p></def></def-item><def-item><term id="abb3">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb4"><italic>ICD-9</italic></term><def><p><italic>International Classification of Diseases, Ninth Revision</italic></p></def></def-item><def-item><term id="abb5">ICU</term><def><p>intensive care unit</p></def></def-item><def-item><term id="abb6">LR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb7">MIMIC</term><def><p>Medical Information Mart for Intensive Care</p></def></def-item><def-item><term id="abb8">MLP</term><def><p>multilayer perceptron</p></def></def-item><def-item><term id="abb9">OR</term><def><p>odds ratio</p></def></def-item><def-item><term id="abb10">RBO</term><def><p>rank biased overlap</p></def></def-item><def-item><term id="abb11">RF</term><def><p>random forest</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Siegel</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Giaquinto</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Jemal</surname><given-names>A</given-names> </name></person-group><article-title>Cancer statistics, 2024</article-title><source>CA Cancer J Clin</source><year>2024</year><volume>74</volume><issue>1</issue><fpage>12</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.3322/caac.21820</pub-id><pub-id pub-id-type="medline">38230766</pub-id></nlm-citation></ref><ref 
id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Colorectal cancer: screening</article-title><source>The US Preventive Services Task Force</source><year>2021</year><access-date>2025-04-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.uspreventiveservicestaskforce.org/uspstf/recommendation/colorectal-cancer-screening">https://www.uspreventiveservicestaskforce.org/uspstf/recommendation/colorectal-cancer-screening</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Breast cancer: screening</article-title><source>The US Preventive Services Task Force</source><year>2024</year><access-date>2025-04-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.uspreventiveservicestaskforce.org/uspstf/recommendation/breast-cancer-screening#:~:text=What%20is%20the%20Task%20Force,This%20is%20a%20B%20grade">https://www.uspreventiveservicestaskforce.org/uspstf/recommendation/breast-cancer-screening#:~:text=What%20is%20the%20Task%20Force,This%20is%20a%20B%20grade</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mansour</surname><given-names>R</given-names> </name><name name-style="western"><surname>Al-Ani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Hussaini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abdel-Razeq</surname><given-names>H</given-names> </name><name name-style="western"><surname>Al-Ibraheem</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mansour</surname><given-names>AH</given-names> </name></person-group><article-title>Modifiable risk factors for cancer in the middle East and North Africa: a scoping review</article-title><source>BMC Public 
Health</source><year>2024</year><month>01</month><day>18</day><volume>24</volume><issue>1</issue><fpage>223</fpage><pub-id pub-id-type="doi">10.1186/s12889-024-17787-5</pub-id><pub-id pub-id-type="medline">38238708</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fitzgerald</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Antoniou</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Fruk</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rosenfeld</surname><given-names>N</given-names> </name></person-group><article-title>The future of early cancer detection</article-title><source>Nat Med</source><year>2022</year><month>04</month><volume>28</volume><issue>4</issue><fpage>666</fpage><lpage>677</lpage><pub-id pub-id-type="doi">10.1038/s41591-022-01746-x</pub-id><pub-id pub-id-type="medline">35440720</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Byskov Petersen</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sadolin Damhus</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ryborg J&#x00F8;nsson</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Brodersen</surname><given-names>J</given-names> </name></person-group><article-title>The perception gap: how the benefits and harms of cervical cancer screening are understood in information material focusing on informed choice</article-title><source>Health Risk Soc</source><year>2020</year><month>02</month><day>17</day><volume>22</volume><issue>2</issue><fpage>177</fpage><lpage>196</lpage><pub-id pub-id-type="doi">10.1080/13698575.2020.1778645</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>CI</given-names> </name><name name-style="western"><surname>Elmore</surname><given-names>JG</given-names> </name></person-group><article-title>Beyond survival: a closer look at lead-time bias and disease-free intervals in mammography screening</article-title><source>J Natl Cancer Inst</source><year>2024</year><month>03</month><day>7</day><volume>116</volume><issue>3</issue><fpage>343</fpage><lpage>344</lpage><pub-id pub-id-type="doi">10.1093/jnci/djad254</pub-id><pub-id pub-id-type="medline">38145456</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duffy</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Tab&#x00E1;r</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yen</surname><given-names>AMF</given-names> </name><etal/></person-group><article-title>Mammography screening reduces rates of advanced and fatal breast cancers: results in 549,091 women</article-title><source>Cancer</source><year>2020</year><month>07</month><day>1</day><volume>126</volume><issue>13</issue><fpage>2971</fpage><lpage>2979</lpage><pub-id pub-id-type="doi">10.1002/cncr.32859</pub-id><pub-id pub-id-type="medline">32390151</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cintolo-Gonzalez</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Braun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Blackford</surname><given-names>AL</given-names> </name><etal/></person-group><article-title>Breast cancer risk models: a comprehensive overview of existing models, 
validation, and clinical applications</article-title><source>Breast Cancer Res Treat</source><year>2017</year><month>07</month><volume>164</volume><issue>2</issue><fpage>263</fpage><lpage>284</lpage><pub-id pub-id-type="doi">10.1007/s10549-017-4247-z</pub-id><pub-id pub-id-type="medline">28444533</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>An assessment of the predictive performance of current machine learning-based breast cancer risk prediction models: systematic review</article-title><source>JMIR Public Health Surveill</source><year>2022</year><month>12</month><day>29</day><volume>8</volume><issue>12</issue><fpage>e35750</fpage><pub-id pub-id-type="doi">10.2196/35750</pub-id><pub-id pub-id-type="medline">36426919</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nayan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Salari</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bozzo</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A machine learning approach to predict progression on active surveillance for prostate cancer</article-title><source>Urol Oncol</source><year>2022</year><month>04</month><volume>40</volume><issue>4</issue><fpage>161</fpage><pub-id pub-id-type="doi">10.1016/j.urolonc.2021.08.007</pub-id><pub-id pub-id-type="medline">34465541</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Le</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>X</given-names> </name></person-group><article-title>Machine learning for prediction of in-hospital mortality in lung cancer patients admitted to intensive care unit</article-title><source>PLoS One</source><year>2023</year><volume>18</volume><issue>1</issue><fpage>e0280606</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0280606</pub-id><pub-id pub-id-type="medline">36701342</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Leonard</surname><given-names>G</given-names> </name><name name-style="western"><surname>South</surname><given-names>C</given-names> </name><name name-style="western"><surname>Balentine</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Machine learning improves prediction over logistic regression on resected colon cancer patients</article-title><source>J Surg Res</source><year>2022</year><month>07</month><volume>275</volume><fpage>181</fpage><lpage>193</lpage><pub-id pub-id-type="doi">10.1016/j.jss.2022.01.012</pub-id><pub-id pub-id-type="medline">35287027</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><name name-style="western"><surname>McCarthy</surname><given-names>AM</given-names> 
</name><etal/></person-group><article-title>Combining breast cancer risk prediction models</article-title><source>Cancers (Basel)</source><year>2023</year><month>02</month><day>8</day><volume>15</volume><issue>4</issue><fpage>1090</fpage><pub-id pub-id-type="doi">10.3390/cancers15041090</pub-id><pub-id pub-id-type="medline">36831433</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shipe</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Deppen</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Farjah</surname><given-names>F</given-names> </name><name name-style="western"><surname>Grogan</surname><given-names>EL</given-names> </name></person-group><article-title>Developing prediction models for clinical use using logistic regression: an overview</article-title><source>J Thorac Dis</source><year>2019</year><month>03</month><volume>11</volume><issue>Suppl 4</issue><fpage>S574</fpage><lpage>S584</lpage><pub-id pub-id-type="doi">10.21037/jtd.2019.01.25</pub-id><pub-id pub-id-type="medline">31032076</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>SP</given-names> </name><etal/></person-group><article-title>Cancer risk associated with chronic diseases and disease markers: prospective cohort study</article-title><source>BMJ</source><year>2018</year><month>01</month><day>31</day><volume>360</volume><fpage>k134</fpage><pub-id pub-id-type="doi">10.1136/bmj.k134</pub-id><pub-id pub-id-type="medline">29386192</pub-id></nlm-citation></ref><ref 
id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shickel</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tighe</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Bihorac</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rashidi</surname><given-names>P</given-names> </name></person-group><article-title>Deep EHR: a survey of recent advances in deep learning techniques for electronic health record (EHR) analysis</article-title><source>IEEE J Biomed Health Inform</source><year>2018</year><month>09</month><volume>22</volume><issue>5</issue><fpage>1589</fpage><lpage>1604</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2017.2767063</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>D</given-names> </name></person-group><article-title>Global, regional and national burden of inflammatory bowel disease in 204 countries and territories from 1990 to 2019: a systematic analysis based on the Global Burden of Disease Study 2019</article-title><source>BMJ Open</source><year>2023</year><month>03</month><day>28</day><volume>13</volume><issue>3</issue><fpage>e065186</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2022-065186</pub-id><pub-id pub-id-type="medline">36977543</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Steinberg</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Yap</surname><given-names>S</given-names> </name><name name-style="western"><surname>Goldsbury</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large-scale systematic analysis of exposure to multiple cancer risk factors and the associations between exposure patterns and cancer incidence</article-title><source>Sci Rep</source><year>2021</year><month>01</month><day>27</day><volume>11</volume><issue>1</issue><fpage>2343</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-81463-6</pub-id><pub-id pub-id-type="medline">33504831</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Belle</surname><given-names>V</given-names> </name><name name-style="western"><surname>Papantonis</surname><given-names>I</given-names> </name></person-group><article-title>Principles and practice of explainable machine learning</article-title><source>Front Big Data</source><year>2021</year><volume>4</volume><fpage>688969</fpage><pub-id pub-id-type="doi">10.3389/fdata.2021.688969</pub-id><pub-id pub-id-type="medline">34278297</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gurmessa</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Jimma</surname><given-names>W</given-names> </name></person-group><article-title>Explainable machine learning for breast cancer diagnosis from mammography and ultrasound images: a systematic review</article-title><source>BMJ Health Care Inform</source><year>2024</year><month>02</month><day>2</day><volume>31</volume><issue>1</issue><fpage>e100954</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2023-100954</pub-id><pub-id pub-id-type="medline">38307616</pub-id></nlm-citation></ref><ref 
id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shulha</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hovdebo</surname><given-names>J</given-names> </name><name name-style="western"><surname>D&#x2019;Souza</surname><given-names>V</given-names> </name><name name-style="western"><surname>Thibault</surname><given-names>F</given-names> </name><name name-style="western"><surname>Harmouche</surname><given-names>R</given-names> </name></person-group><article-title>Integrating explainable machine learning in clinical decision support systems: study involving a modified design thinking approach</article-title><source>JMIR Form Res</source><year>2024</year><month>04</month><day>16</day><volume>8</volume><fpage>e50475</fpage><pub-id pub-id-type="doi">10.2196/50475</pub-id><pub-id pub-id-type="medline">38625728</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moncada-Torres</surname><given-names>A</given-names> </name><name name-style="western"><surname>van Maaren</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Hendriks</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Siesling</surname><given-names>S</given-names> </name><name name-style="western"><surname>Geleijnse</surname><given-names>G</given-names> </name></person-group><article-title>Explainable machine learning can outperform Cox regression predictions and provide insights in breast cancer survival</article-title><source>Sci Rep</source><year>2021</year><month>03</month><day>26</day><volume>11</volume><issue>1</issue><fpage>6968</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-86327-7</pub-id><pub-id pub-id-type="medline">33772109</pub-id></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Han</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name></person-group><article-title>Efficient and stable unsupervised feature selection based on novel structured graph and data discrepancy learning</article-title><source>IEEE Trans Neural Netw Learning Syst</source><year>2024</year><volume>36</volume><issue>4</issue><fpage>6229</fpage><lpage>6243</lpage><pub-id pub-id-type="doi">10.1109/TNNLS.2024.3385838</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care database</article-title><source>Sci Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><article-title>Hypertension - health, United States</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.cdc.gov/nchs/hus/topics/hypertension.htm">https://www.cdc.gov/nchs/hus/topics/hypertension.htm</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>FastStats - hypertension</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2024-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/nchs/fastats/hypertension.htm">https://www.cdc.gov/nchs/fastats/hypertension.htm</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>National Diabetes Statistics Report</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/diabetes/php/data-research/index.html">https://www.cdc.gov/diabetes/php/data-research/index.html</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>High cholesterol facts</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/cholesterol/data-research/facts-stats/index.html">https://www.cdc.gov/cholesterol/data-research/facts-stats/index.html</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><article-title>QuickStats: prevalence of high total cholesterol* among adults aged &#x2265;20 years,&#x2020; by age group and sex - National Health and Nutrition Examination Survey, 2015-2018</article-title><source>MMWR Morb Mortal Wkly Rep</source><year>2020</year><month>06</month><day>5</day><volume>69</volume><issue>22</issue><fpage>690</fpage><pub-id pub-id-type="doi">10.15585/mmwr.mm6922a5</pub-id><pub-id
pub-id-type="medline">32497032</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bozkurt</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>T</given-names> </name><name name-style="western"><surname>Alexander</surname><given-names>KM</given-names> </name><etal/></person-group><article-title>Heart failure epidemiology and outcomes statistics: a report of the Heart Failure Society of America</article-title><source>J Card Fail</source><year>2023</year><month>10</month><volume>29</volume><issue>10</issue><fpage>1412</fpage><lpage>1451</lpage><pub-id pub-id-type="doi">10.1016/j.cardfail.2023.07.006</pub-id><pub-id pub-id-type="medline">37797885</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Antunes</surname><given-names>C</given-names> </name><name name-style="western"><surname>Aleem</surname><given-names>A</given-names> </name><name name-style="western"><surname>Curtis</surname><given-names>SA</given-names> </name></person-group><article-title>Gastroesophageal reflux disease</article-title><source>StatPearls</source><year>2023</year><publisher-name>StatPearls Publishing</publisher-name></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>FastStats - pneumonia</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/nchs/fastats/pneumonia.htm">https://www.cdc.gov/nchs/fastats/pneumonia.htm</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name
name-style="western"><surname>Moawad</surname><given-names>H</given-names> </name></person-group><article-title>Anemia facts and statistics: what you need to know</article-title><source>Verywell Health</source><year>2022</year><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.verywellhealth.com/facts-about-anemia-6503636#:~:text=According%20to%20one%20study%2C%205.6%25%20of%20the%20U.S.,and%20people%20of%20any%20age%20who%20are%20undernourished">https://www.verywellhealth.com/facts-about-anemia-6503636#:~:text=According%20to%20one%20study%2C%205.6%25%20of%20the%20U.S.,and%20people%20of%20any%20age%20who%20are%20undernourished</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wyne</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Nair</surname><given-names>L</given-names> </name><name name-style="western"><surname>Schneiderman</surname><given-names>CP</given-names> </name><etal/></person-group><article-title>Hypothyroidism prevalence in the United States: a retrospective study combining National Health and Nutrition Examination Survey and claims data, 2009-2019</article-title><source>J Endocr Soc</source><year>2022</year><month>11</month><day>17</day><volume>7</volume><issue>1</issue><fpage>bvac172</fpage><pub-id pub-id-type="doi">10.1210/jendso/bvac172</pub-id><pub-id pub-id-type="medline">36466005</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>Current cigarette smoking among adults in the United States</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri"
xlink:href="https://www.cdc.gov/tobacco/php/data-statistics/adult-data-cigarettes/index.html">https://www.cdc.gov/tobacco/php/data-statistics/adult-data-cigarettes/index.html</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>Mental health disorder statistics</article-title><source>The Johns Hopkins University</source><access-date>2025-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hopkinsmedicine.org/health/wellness-and-prevention/mental-health-disorder-statistics">https://www.hopkinsmedicine.org/health/wellness-and-prevention/mental-health-disorder-statistics</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Carlson</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Watson</surname><given-names>KB</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Greenlund</surname><given-names>KJ</given-names> </name></person-group><article-title>Trends in the prevalence of chronic obstructive pulmonary disease among adults aged &#x2265;18 years - United States, 2011-2021</article-title><source>MMWR Morb Mortal Wkly Rep</source><year>2023</year><month>11</month><day>17</day><volume>72</volume><issue>46</issue><fpage>1250</fpage><lpage>1256</lpage><pub-id pub-id-type="doi">10.15585/mmwr.mm7246a1</pub-id><pub-id pub-id-type="medline">37971940</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gramfort</surname><given-names>A</given-names> </name><name name-style="western"><surname>Michel</surname><given-names>V</given-names> </name><name name-style="western"><surname>Thirion</surname><given-names>B</given-names> </name><name name-style="western"><surname>Grisel</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Scikit-learn: machine learning in Python</article-title><source>J Mach Learn Res</source><year>2011</year><volume>12</volume><fpage>2825</fpage><lpage>2830</lpage></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sarica</surname><given-names>A</given-names> </name><name name-style="western"><surname>Quattrone</surname><given-names>A</given-names> </name></person-group><article-title>Introducing the rank-biased overlap as similarity measure for feature importance in explainable machine learning: a case study on Parkinson&#x2019;s disease</article-title><conf-name>15th International Conference on Brain Informatics</conf-name><conf-date>Jul 15-17, 2022</conf-date><conf-loc>Padua, Italy</conf-loc><pub-id pub-id-type="doi">10.1007/978-3-031-15037-1_11</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>Rosenblatt</surname><given-names>F</given-names> </name></person-group><article-title>The perceptron: a probabilistic model for information storage and organization in the brain</article-title><source>Psychol Rev</source><year>1958</year><month>11</month><volume>65</volume><issue>6</issue><fpage>386</fpage><lpage>408</lpage><pub-id pub-id-type="doi">10.1037/h0042519</pub-id><pub-id pub-id-type="medline">13602029</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nirmalraj</surname><given-names>S</given-names> </name><name name-style="western"><surname>Antony</surname><given-names>ASM</given-names> </name><name name-style="western"><surname>Srideviponmalar</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Permutation feature importance-based fusion techniques for diabetes prediction</article-title><source>Soft Comput</source><year>2023</year><pub-id pub-id-type="doi">10.1007/s00500-023-08041-y</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mertens</surname><given-names>E</given-names> </name><name name-style="western"><surname>Barrenechea-Pulache</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sagastume</surname><given-names>D</given-names> </name><name name-style="western"><surname>Vasquez</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Vandevijvere</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pe&#x00F1;alvo</surname><given-names>JL</given-names> </name></person-group><article-title>Understanding the contribution of lifestyle in breast cancer risk prediction: a systematic review of models applicable to Europe</article-title><source>BMC
Cancer</source><year>2023</year><month>07</month><day>21</day><volume>23</volume><issue>1</issue><fpage>687</fpage><pub-id pub-id-type="doi">10.1186/s12885-023-11174-w</pub-id><pub-id pub-id-type="medline">37480028</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Usher-Smith</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Sharp</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Luben</surname><given-names>R</given-names> </name><name name-style="western"><surname>Griffin</surname><given-names>SJ</given-names> </name></person-group><article-title>Development and validation of lifestyle-based models to predict incidence of the most common potentially preventable cancers</article-title><source>Cancer Epidemiol Biomarkers Prev</source><year>2019</year><month>01</month><day>1</day><volume>28</volume><issue>1</issue><fpage>67</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.1158/1055-9965.EPI-18-0400</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harrison</surname><given-names>H</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Risk prediction models for kidney cancer: a systematic review</article-title><source>Eur Urol Focus</source><year>2021</year><month>11</month><volume>7</volume><issue>6</issue><fpage>1380</fpage><lpage>1390</lpage><pub-id pub-id-type="doi">10.1016/j.euf.2020.06.024</pub-id><pub-id pub-id-type="medline">32680829</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Carver</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hartley</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>A</given-names> </name><etal/></person-group><article-title>CanRisk tool&#x2014;a web interface for the prediction of breast and ovarian cancer risk and the likelihood of carrying genetic pathogenic variants</article-title><source>Cancer Epidemiol Biomarkers Prev</source><year>2021</year><month>03</month><volume>30</volume><issue>3</issue><fpage>469</fpage><lpage>473</lpage><pub-id pub-id-type="doi">10.1158/1055-9965.EPI-20-1319</pub-id><pub-id pub-id-type="medline">33335023</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thomas</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sakoda</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Hoffmeister</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Genome-wide modeling of polygenic risk score in colorectal cancer risk</article-title><source>Am J Hum Genet</source><year>2020</year><month>09</month><day>3</day><volume>107</volume><issue>3</issue><fpage>432</fpage><lpage>444</lpage><pub-id pub-id-type="doi">10.1016/j.ajhg.2020.07.006</pub-id><pub-id pub-id-type="medline">32758450</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yuhara</surname><given-names>H</given-names> </name><name name-style="western"><surname>Steinmaus</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>SE</given-names> </name><name
name-style="western"><surname>Corley</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Tei</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Buffler</surname><given-names>PA</given-names> </name></person-group><article-title>Is diabetes mellitus an independent risk factor for colon cancer and rectal cancer?</article-title><source>Am J Gastroenterol</source><year>2011</year><month>11</month><volume>106</volume><issue>11</issue><fpage>1911</fpage><lpage>1921</lpage><pub-id pub-id-type="doi">10.1038/ajg.2011.301</pub-id><pub-id pub-id-type="medline">21912438</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muthanna</surname><given-names>FMS</given-names> </name><name name-style="western"><surname>Karuppannan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abdulrahman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Uitrakul</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rasool</surname><given-names>BAH</given-names> </name><name name-style="western"><surname>Mohammed</surname><given-names>AH</given-names> </name></person-group><article-title>Prevalence and associated factors of anemia among breast cancer patients undergoing chemotherapy: a prospective study</article-title><source>Adv Pharmacol Pharm Sci</source><year>2022</year><volume>2022</volume><fpage>7611733</fpage><pub-id pub-id-type="doi">10.1155/2022/7611733</pub-id><pub-id pub-id-type="medline">35464620</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Masood</surname><given-names>L</given-names> </name><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Ali</surname><given-names>NZ</given-names> </name><etal/></person-group><article-title>A narrative literature review on sepsis: a primary manifestation of colorectal neoplasm</article-title><source>Cureus</source><year>2023</year><month>09</month><volume>15</volume><issue>9</issue><fpage>e44803</fpage><pub-id pub-id-type="doi">10.7759/cureus.44803</pub-id><pub-id pub-id-type="medline">37809261</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stan</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Georgescu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mire&#x0219;tean</surname><given-names>CC</given-names> </name><name name-style="western"><surname>B&#x0103;dulescu</surname><given-names>F</given-names> </name></person-group><article-title>Cancer and diabetes: predictive factors in patients with metabolic syndrome</article-title><source>Diagnostics (Basel)</source><year>2023</year><month>08</month><day>11</day><volume>13</volume><issue>16</issue><fpage>2647</fpage><pub-id pub-id-type="doi">10.3390/diagnostics13162647</pub-id><pub-id pub-id-type="medline">37627906</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soltani</surname><given-names>G</given-names> </name><name name-style="western"><surname>Poursheikhani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yassi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hayatbakhsh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kerachian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kerachian</surname><given-names>MA</given-names> 
</name></person-group><article-title>Obesity, diabetes and the risk of colorectal adenoma and cancer</article-title><source>BMC Endocr Disord</source><year>2019</year><month>10</month><day>29</day><volume>19</volume><issue>1</issue><fpage>113</fpage><pub-id pub-id-type="doi">10.1186/s12902-019-0444-6</pub-id><pub-id pub-id-type="medline">31664994</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chardalias</surname><given-names>L</given-names> </name><name name-style="western"><surname>Papaconstantinou</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gklavas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Politou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Theodosopoulos</surname><given-names>T</given-names> </name></person-group><article-title>Iron deficiency anemia in colorectal cancer patients: is preoperative intravenous iron infusion indicated? 
a narrative review of the literature</article-title><source>Cancer Diagn Progn</source><year>2023</year><volume>3</volume><issue>2</issue><fpage>163</fpage><lpage>168</lpage><pub-id pub-id-type="doi">10.21873/cdp.10196</pub-id><pub-id pub-id-type="medline">36875314</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamilton</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lancashire</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sharp</surname><given-names>D</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>T</given-names> </name></person-group><article-title>The importance of anaemia in diagnosing colorectal cancer: a case-control study using electronic primary care records</article-title><source>Br J Cancer</source><year>2008</year><month>01</month><day>29</day><volume>98</volume><issue>2</issue><fpage>323</fpage><lpage>327</lpage><pub-id pub-id-type="doi">10.1038/sj.bjc.6604165</pub-id><pub-id pub-id-type="medline">18219289</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Boer</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Meijers</surname><given-names>WC</given-names> </name><name name-style="western"><surname>van der Meer</surname><given-names>P</given-names> </name><name name-style="western"><surname>van Veldhuisen</surname><given-names>DJ</given-names> </name></person-group><article-title>Cancer and heart disease: associations and relations</article-title><source>Eur J Heart 
Fail</source><year>2019</year><month>12</month><volume>21</volume><issue>12</issue><fpage>1515</fpage><lpage>1525</lpage><pub-id pub-id-type="doi">10.1002/ejhf.1539</pub-id><pub-id pub-id-type="medline">31321851</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="web"><article-title>Age and cancer risk</article-title><source>National Cancer Institute</source><year>2021</year><access-date>2025-04-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/about-cancer/causes-prevention/risk/age">https://www.cancer.gov/about-cancer/causes-prevention/risk/age</ext-link></comment></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zavala</surname><given-names>VA</given-names> </name><name name-style="western"><surname>Bracci</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Carethers</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Cancer health disparities in racial/ethnic minorities in the United States</article-title><source>Br J Cancer</source><year>2021</year><month>01</month><volume>124</volume><issue>2</issue><fpage>315</fpage><lpage>332</lpage><pub-id pub-id-type="doi">10.1038/s41416-020-01038-6</pub-id><pub-id pub-id-type="medline">32901135</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>C</given-names> </name></person-group><article-title>Association between marital status and all-cause mortality of patients with metastatic breast cancer: a population-based study</article-title><source>Sci 
Rep</source><year>2023</year><month>06</month><day>5</day><volume>13</volume><issue>1</issue><fpage>9067</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-36139-8</pub-id><pub-id pub-id-type="medline">37277464</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>ZH</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>KB</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>YZ</given-names> </name><etal/></person-group><article-title>Assessment of modifiable factors for the association of marital status with cancer-specific survival</article-title><source>JAMA Netw Open</source><year>2021</year><month>05</month><day>3</day><volume>4</volume><issue>5</issue><fpage>e2111813</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.11813</pub-id><pub-id pub-id-type="medline">34047792</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kretzler</surname><given-names>B</given-names> </name><name name-style="western"><surname>K&#x00F6;nig</surname><given-names>HH</given-names> </name><name name-style="western"><surname>Brandt</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weiss</surname><given-names>HR</given-names> </name><name name-style="western"><surname>Hajek</surname><given-names>A</given-names> </name></person-group><article-title>Religious denomination, religiosity, religious attendance, and cancer prevention. 
A systematic review</article-title><source>Risk Manag Healthc Policy</source><year>2022</year><volume>15</volume><fpage>45</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.2147/RMHP.S341085</pub-id><pub-id pub-id-type="medline">35079226</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="web"><source>PhysioNet</source><access-date>2025-04-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://physionet.org/">https://physionet.org/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary tables on the <italic>ICD-9</italic> codes description, model performance, and top 20 ranked features for the 4 cancer types</p><media xlink:href="cancer_v11i1e62833_app1.docx" xlink:title="DOCX File, 32 KB"/></supplementary-material></app-group></back></article>