Module 01: Screening of ZWHQD Active Components

Authors

Affiliations

Kun Hou

Health Science Center, Xi’an Jiaotong University

Hanzhong Traditional Chinese Medicine Hospital

Supervisor’s name

Health Science Center, Xi’an Jiaotong University

The First Affiliated Hospital of Xi’an Jiaotong University

1 Overview

This module collects the blood-absorbed ingredients of Zhenwu Huangqi Decoction (ZWHQD) from the DCABM-TCM database¹ (http://bionet.ncpsb.org.cn/dcabm-tcm/). After multi-stage screening, the retained active components were standardized and annotated with chemical information retrieved from PubChem² (https://pubchem.ncbi.nlm.nih.gov/).

Screening Criteria:

1. Remove compounds without a valid PubChem CID.
2. Remove toxic diester-type alkaloids from Fu Zi.
3. Remove basic nutrients (sugars, amino acids, common metabolites).
4. Manually add cycloastragenol (key active metabolite of Huang Qi)³.

Final Output: Standardized active component dataset & supplementary tables

2 Load Packages

Code

library(tidyverse)
library(data.table)
library(openxlsx)
library(furrr)
library(progressr)

3 Step 1: Load Raw Data from DCABM-TCM

Input: data/raw/tcm/zwhqd_compound_dcabm.xlsx

Code

# Step 1: read the raw DCABM-TCM export and snapshot it as an .RData file.
#
# The workspace is deliberately not cleared here: `rm(list = ls())` would
# wipe the session of anyone running this chunk interactively, and nothing
# below depends on an empty environment.
raw_xlsx <- "../../data/raw/tcm/zwhqd_compound_dcabm.xlsx"

# Fail early with a readable message instead of an opaque reader error.
if (!file.exists(raw_xlsx)) {
  stop("Raw DCABM-TCM export not found: ", raw_xlsx, call. = FALSE)
}

compounds_raw <- read.xlsx(raw_xlsx)

save(
  compounds_raw,
  file = "../../data/processed/tcm/01_compounds_raw.RData"
)

4 Step 2: Raw Data Preprocessing

Parse compound names, CIDs, formulas, and clean formatting.

Code

# Project helpers. clean_cids(), fetch_batch() and download_all_sdf(), used
# further down, are not defined in this document and presumably come from
# this file.
source("../../scripts/utils.R")

# Each `Compounds` cell holds several "|"-separated entries. Judging from the
# patterns below, one entry is expected to look like
#   "<name> (<formula>) (Pubchem CID: <cid>) (DCABM ID: <id>)"
# -- TODO confirm against the raw export.
compounds_clean <- compounds_raw %>%
  mutate(across(where(is.character), trimws)) %>%
  separate_rows(Compounds, sep = "\\|") %>%
  mutate(
    # Entries split out of the middle of a cell keep the blanks that
    # surrounded "|"; trim them so the "$"-anchored patterns below can match.
    Compounds = str_trim(Compounds),
    CID = str_extract(Compounds, "(?<=Pubchem CID:)[^)]+") %>% str_trim(),
    # Drop the "(Pubchem CID: ...)" group wherever it sits in the entry.
    temp = str_remove(Compounds, "\\s+\\(Pubchem CID:[^)]+\\)") %>%
      str_trim()
  ) %>%
  mutate(
    # Strip the last parenthesised group (presumably the DCABM ID) ...
    temp2 = str_remove(temp, "\\([^)]+\\)$") %>% str_trim(),
    # ... so that the trailing group is now the formula and the rest the name.
    Compound = str_remove(temp2, "\\([^)]+\\)$") %>% str_trim(),
    Formula = str_match(temp2, "\\(([^)]+)\\)$")[, 2] %>% str_trim()
  ) %>%
  # The DCABM ID is not used downstream, so it is no longer extracted;
  # any_of() still drops a column of that name should the input carry one.
  select(-Compounds, -temp, -temp2, -any_of("DCABM.ID")) %>%
  mutate(across(where(is.character), trimws))

4.1 Save processed data

Code

# Persist the parsed compound table in two formats: .xlsx and .RData.
write.xlsx(
  x = compounds_clean,
  file = "../../data/processed/tcm/02_compounds_clean.xlsx"
)

save(
  list = "compounds_clean",
  file = "../../data/processed/tcm/02_compounds_clean.RData"
)

5 Step 3: Manually Add Cycloastragenol

Code

# Append cycloastragenol by hand (listed in the overview as a key active
# metabolite of Huang Qi), then re-sort the table by herb and CID.
with_cycloastragenol <- add_row(
  compounds_clean,
  Herb.Name.Chinese = "黄芪",
  Herb.Name.Pinyin = "HUANG QI",
  Herb.Name.English = "Astragali Radix",
  Herb.Name.Latin = "Astragalus membranaceus",
  Type = "Metabolite",
  CID = "13943286",
  Compound = "cycloastragenol",
  Formula = "C30H50O5"
)

compounds_add <- arrange(with_cycloastragenol, Herb.Name.Pinyin, CID)

save(
  list = "compounds_add",
  file = "../../data/processed/tcm/03_compounds_add_cycloastragenol.RData"
)

6 Step 4: Filter Invalid & Duplicate Compounds

Code

# Step 4: keep only rows with a usable PubChem CID, then de-duplicate within
# each herb, first by CID and then by compound name.
compounds_filtered <- compounds_add %>%
  # "-" and "" are the placeholders treated as "no CID"; %in% never yields NA.
  filter(!is.na(CID), !CID %in% c("-", "")) %>%
  mutate(CID = clean_cids(CID)) %>%
  # clean_cids() is defined outside this document. The PubChem retrieval step
  # drops NA after calling it, so the same guard is applied here to keep
  # rows without a valid CID out of the filtered table.
  filter(!is.na(CID)) %>%
  distinct(Herb.Name.Chinese, CID, .keep_all = TRUE) %>%
  distinct(Herb.Name.Chinese, Compound, .keep_all = TRUE)

save(
  compounds_filtered,
  file = "../../data/processed/tcm/04_compounds_filtered.RData"
)

7 Step 5: Load Manually Curated Properties & Final Filter

Input: Manually annotated compound list (data/raw/tcm/compounds_properties_manual.xlsx)

Code

# Read the manually curated sheet.
compounds_manual <- read.xlsx("../../data/raw/tcm/compounds_properties_manual.xlsx")

# Keep the rows whose Drop flag is exactly "No", remove the Note,
# Compound.Toxic and Drop columns, and leave one row per herb-CID pair.
retained <- dplyr::filter(compounds_manual, Drop == "No")
retained <- dplyr::select(retained, -c(Note, Compound.Toxic, Drop))
compounds_final_filtered <- distinct(
  retained,
  Herb.Name.Pinyin, CID,
  .keep_all = TRUE
)

save(
  list = "compounds_final_filtered",
  file = "../../data/processed/tcm/05_compounds_final_filtered.RData"
)

8 Step 6: Batch Retrieve PubChem Annotations

Code

# load("../../data/processed/tcm/05_compounds_final_filtered.RData")

# One row per CID. Note that `.keep_all = TRUE` keeps the herb columns of the
# first row seen for each CID only.
compounds_pubchem <- compounds_final_filtered %>%
  mutate(CID = clean_cids(CID)) %>%
  filter(!is.na(CID)) %>%
  distinct(CID, .keep_all = TRUE)

# Property names passed to fetch_batch() (defined outside this document).
properties <- c("IUPACName", "MolecularFormula", "MolecularWeight",
                "SMILES", "InChI", "InChIKey")

unique_cids <- unique(compounds_pubchem$CID)
if (length(unique_cids) == 0L) {
  stop(
    "No valid CIDs left to annotate; check the curated compound list.",
    call. = FALSE
  )
}

# Batches of at most 20 CIDs each.
cid_batches <- split(unique_cids, ceiling(seq_along(unique_cids) / 20))

# `availableCores() - 2` is zero or negative on machines with one or two
# cores, which is not a valid worker count, so use at least one worker.
# The upper bound of 5 keeps the number of concurrent requests in line with
# PubChem's published limit of 5 requests per second
# (NOTE(review): relax it if fetch_batch() already throttles).
n_workers <- min(5L, max(1L, availableCores() - 2L))

plan(multisession, workers = n_workers)
tryCatch(
  with_progress({
    p <- progressor(steps = length(cid_batches))
    chem_info <- future_map_dfr(cid_batches, function(b) {
      batch_info <- fetch_batch(properties, b)
      p()  # tick once the batch has actually been fetched
      batch_info
    })
  }),
  # Always return to the sequential plan, even when a batch fails, so that no
  # multisession workers are left behind.
  finally = plan(sequential)
)

8.1 Merge & Finalize

Code

# Columns of the final table, in output order.
final_columns <- c(
  "Herb.Name.Chinese", "Herb.Name.Pinyin", "Herb.Name.English",
  "Compound.Name", "IUPACName", "Compound.Category",
  "MolecularFormula", "MolecularWeight", "CID", "SMILES", "InChI", "InChIKey"
)

# Attach the PubChem annotations by CID, keep the columns above and trim
# every character column.
annotated <- left_join(compounds_pubchem, chem_info, by = "CID")
annotated <- select(annotated, all_of(final_columns))
compounds_final <- mutate(annotated, across(where(is.character), trimws))

save(
  list = "compounds_final",
  file = "../../data/processed/tcm/06_compounds_final.RData"
)

9 Step 7: Compound Sharing Statistics (Across Herbs)

Code

# Step 7: for every final compound, count the herbs it is listed under and
# keep the compounds shared by at least two herbs.
#
# compounds_final cannot be used as the source of the herb-CID pairs: it is
# derived from compounds_pubchem, which was reduced to one row per CID with
# distinct(CID, .keep_all = TRUE), so every CID is paired with a single herb
# there and Herb_Count would always be 1. The herb-level pairs are still
# present in compounds_final_filtered, so they are taken from that table and
# restricted to the CIDs that made it into compounds_final.
compound_sharing <- compounds_final_filtered %>%
  mutate(CID = clean_cids(CID)) %>%
  filter(!is.na(CID)) %>%
  semi_join(compounds_final, by = "CID") %>%
  distinct(Herb.Name.Pinyin, CID) %>%
  group_by(CID) %>%
  summarise(
    Herb_Count = n(),
    Herbs = paste(Herb.Name.Pinyin, collapse = ", "),
    .groups = "drop"
  ) %>%
  filter(Herb_Count >= 2) %>%
  arrange(desc(Herb_Count))

save(
  compound_sharing,
  file = "../../results/processed/module01_compound_sharing_stats.RData"
)

10 Step 8: Download SDF Files

Code

library(httr)

unique_cids <- unique(compounds_pubchem$CID)
sdf_files <- "../../data/raw/tcm/sdf/"

# download_all_sdf() is defined outside this document; create the target
# directory up front in case the helper does not do so itself.
dir.create(sdf_files, recursive = TRUE, showWarnings = FALSE)

results <- download_all_sdf(unique_cids, sdf_files)

# Check actual files
actual_files <- list.files(sdf_files, pattern = "\\.sdf$")
cat("\nActual files downloaded:", length(actual_files), "\n")

# Assumes one .sdf file per CID -- TODO confirm against download_all_sdf().
if (length(actual_files) < length(unique_cids)) {
  warning(
    "Found ", length(actual_files), " .sdf files for ",
    length(unique_cids), " requested CIDs; some downloads may have failed.",
    call. = FALSE
  )
}

11 Results

12 Final Active Components Count

Code

# Number of distinct CIDs in the final table.
n_active <- length(unique(compounds_final$CID))
cat("Total active compounds:", n_active, "\n")

# Rows per herb. compounds_final descends from a table reduced to one row per
# CID, so a compound listed under several herbs is counted for one herb only.
table(compounds_final$Herb.Name.Pinyin)

13 Export Final Tables

13.1 Supplementary Table

Code

# Supplementary tables, keyed by output path:
#   Table S1 - parsed DCABM-TCM compound table
#   Table S2 - final annotated active components
supplementary_tables <- list(
  "../../tables/supplementary/Table_S1_Raw_Compounds_DCABM-TCM.xlsx" =
    compounds_clean,
  "../../tables/supplementary/Table_S2_Final_Active_Components.xlsx" =
    compounds_final
)

for (path in names(supplementary_tables)) {
  write.xlsx(supplementary_tables[[path]], path)
}

14 Save all final results

Code

# Bundle the two result objects of this module into a single .RData file.
save(
  list = c("compounds_final", "compound_sharing"),
  file = "../../results/processed/module01_final_results.RData"
)

Home | About | Methods | Results

Next Module

References

(1)

Liu, X.; Liu, J.; Fu, B.; Chen, R.; Jiang, J.; Chen, H.; Li, R.; Xing, L.; Yuan, L.; Chen, X.; Zhang, J.; Li, H.; Guo, S.; Guo, F.; Guo, J.; Liu, Y.; Qi, Y.; Yu, B.; Xu, F.; Li, D.; Liu, Z. DCABM-TCM: A Database of Constituents Absorbed into the Blood and Metabolites of Traditional Chinese Medicine. Journal of Chemical Information and Modeling 2023, 63 (15), 4948–4959. https://doi.org/10.1021/acs.jcim.3c00365.

(2)

Kim, S.; Chen, J.; Cheng, T.; Gindulyte, A.; He, J.; He, S.; Li, Q.; Shoemaker, B. A.; Thiessen, P. A.; Yu, B.; Zaslavsky, L.; Zhang, J.; Bolton, E. E. PubChem 2025 Update. Nucleic Acids Research 2025, 53 (D1), D1516–D1525. https://doi.org/10.1093/nar/gkae1059.

(3)

Zhao, C.; Yang, X.; Yao, M.; Song, X.; Dai, J.; He, P. Cycloastragenol in Inflammation-Related Diseases: Mechanisms, Pharmacokinetics, and Translational Prospects. Frontiers in Pharmacology 2026, 16, 1732996. https://doi.org/10.3389/fphar.2025.1732996.

---
title: "Module 01: Screening of ZWHQD Active Components"
format:
  html:
    toc: true
    number-sections: true
---

```{r setup, include=FALSE, warning=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.dpi = 300,
  fig.align = "center"
)
options(timeout = 36000)
options(stringsAsFactors = FALSE)
options(download.file.method = "curl")
# NOTE(review): "-k" makes curl skip TLS certificate verification; confirm
# this is intentional.
options(download.file.extra = "-k -L")
my_repos <- BiocManager::repositories()
my_repos["CRAN"] <- "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
options(repos = my_repos)
```

# Overview

This module collects the blood-absorbed ingredients of **Zhenwu Huangqi Decoction (ZWHQD)** from the DCABM-TCM database @liu2023dcabm (http://bionet.ncpsb.org.cn/dcabm-tcm/). After multi-stage screening, the retained active components were standardized and annotated with chemical information retrieved from PubChem @kim2025pubchem (https://pubchem.ncbi.nlm.nih.gov/).

**Screening Criteria**:

1. Remove compounds without a valid PubChem CID.
2. Remove toxic diester-type alkaloids from Fu Zi.
3. Remove basic nutrients (sugars, amino acids, common metabolites).
4. Manually add cycloastragenol (key active metabolite of Huang Qi)@zhao2025cycloastragenol.

**Final Output**: Standardized active component dataset & supplementary tables

---

# Load Packages

```{r pkgs}
library(tidyverse)
library(data.table)
library(openxlsx)
library(furrr)
library(progressr)
```

---

# Step 1: Load Raw Data from DCABM-TCM

**Input**: `data/raw/tcm/zwhqd_compound_dcabm.xlsx`

```{r dcabm}
# The workspace is deliberately not cleared here: `rm(list = ls())` would
# wipe the session of anyone running this chunk interactively, and nothing
# below depends on an empty environment.
raw_xlsx <- "../../data/raw/tcm/zwhqd_compound_dcabm.xlsx"

# Fail early with a readable message instead of an opaque reader error.
if (!file.exists(raw_xlsx)) {
  stop("Raw DCABM-TCM export not found: ", raw_xlsx, call. = FALSE)
}

compounds_raw <- read.xlsx(raw_xlsx)

save(
  compounds_raw,
  file = "../../data/processed/tcm/01_compounds_raw.RData"
)
```

---

# Step 2: Raw Data Preprocessing

Parse compound names, CIDs, formulas, and clean formatting.

```{r}
# Project helpers. clean_cids(), fetch_batch() and download_all_sdf(), used
# further down, are not defined in this document and presumably come from
# this file.
source("../../scripts/utils.R")

# Each `Compounds` cell holds several "|"-separated entries. Judging from the
# patterns below, one entry is expected to look like
#   "<name> (<formula>) (Pubchem CID: <cid>) (DCABM ID: <id>)"
# -- TODO confirm against the raw export.
compounds_clean <- compounds_raw %>%
  mutate(across(where(is.character), trimws)) %>%
  separate_rows(Compounds, sep = "\\|") %>%
  mutate(
    # Entries split out of the middle of a cell keep the blanks that
    # surrounded "|"; trim them so the "$"-anchored patterns below can match.
    Compounds = str_trim(Compounds),
    CID = str_extract(Compounds, "(?<=Pubchem CID:)[^)]+") %>% str_trim(),
    # Drop the "(Pubchem CID: ...)" group wherever it sits in the entry.
    temp = str_remove(Compounds, "\\s+\\(Pubchem CID:[^)]+\\)") %>%
      str_trim()
  ) %>%
  mutate(
    # Strip the last parenthesised group (presumably the DCABM ID) ...
    temp2 = str_remove(temp, "\\([^)]+\\)$") %>% str_trim(),
    # ... so that the trailing group is now the formula and the rest the name.
    Compound = str_remove(temp2, "\\([^)]+\\)$") %>% str_trim(),
    Formula = str_match(temp2, "\\(([^)]+)\\)$")[, 2] %>% str_trim()
  ) %>%
  # The DCABM ID is not used downstream, so it is no longer extracted;
  # any_of() still drops a column of that name should the input carry one.
  select(-Compounds, -temp, -temp2, -any_of("DCABM.ID")) %>%
  mutate(across(where(is.character), trimws))
```

## Save processed data

```{r}
save(
  compounds_clean,
  file = "../../data/processed/tcm/02_compounds_clean.RData"
)

write.xlsx(
  compounds_clean,
  "../../data/processed/tcm/02_compounds_clean.xlsx"
)
```

---

# Step 3: Manually Add Cycloastragenol

```{r}
compounds_add <- compounds_clean %>%
  add_row(
    Herb.Name.Chinese = "黄芪",
    Herb.Name.Pinyin = "HUANG QI",
    Herb.Name.English = "Astragali Radix",
    Herb.Name.Latin = "Astragalus membranaceus",
    Type = "Metabolite",
    CID = "13943286",
    Compound = "cycloastragenol",
    Formula = "C30H50O5"
  ) %>%
  arrange(Herb.Name.Pinyin, CID)

save(
  compounds_add,
  file = "../../data/processed/tcm/03_compounds_add_cycloastragenol.RData"
)
```

---

# Step 4: Filter Invalid & Duplicate Compounds

```{r}
compounds_filtered <- compounds_add %>%
  # "-" and "" are the placeholders treated as "no CID"; %in% never yields NA.
  filter(!is.na(CID), !CID %in% c("-", "")) %>%
  mutate(CID = clean_cids(CID)) %>%
  # clean_cids() is defined outside this document. The PubChem retrieval step
  # drops NA after calling it, so the same guard is applied here to keep
  # rows without a valid CID out of the filtered table.
  filter(!is.na(CID)) %>%
  distinct(Herb.Name.Chinese, CID, .keep_all = TRUE) %>%
  distinct(Herb.Name.Chinese, Compound, .keep_all = TRUE)

save(
  compounds_filtered,
  file = "../../data/processed/tcm/04_compounds_filtered.RData"
)
```

---

# Step 5: Load Manually Curated Properties & Final Filter

**Input**: Manually annotated compound list (`data/raw/tcm/compounds_properties_manual.xlsx`)

```{r}
compounds_manual <- read.xlsx("../../data/raw/tcm/compounds_properties_manual.xlsx")

compounds_final_filtered <- compounds_manual %>%
  dplyr::filter(Drop == "No") %>%
  dplyr::select(
    -c(Note, Compound.Toxic, Drop)
  ) %>%
  distinct(Herb.Name.Pinyin, CID, .keep_all = TRUE)

save(
  compounds_final_filtered,
  file = "../../data/processed/tcm/05_compounds_final_filtered.RData"
)
```

---

# Step 6: Batch Retrieve PubChem Annotations

```{r}
# load("../../data/processed/tcm/05_compounds_final_filtered.RData")

# One row per CID. Note that `.keep_all = TRUE` keeps the herb columns of the
# first row seen for each CID only.
compounds_pubchem <- compounds_final_filtered %>%
  mutate(CID = clean_cids(CID)) %>%
  filter(!is.na(CID)) %>%
  distinct(CID, .keep_all = TRUE)

# Property names passed to fetch_batch() (defined outside this document).
properties <- c("IUPACName", "MolecularFormula", "MolecularWeight",
                "SMILES", "InChI", "InChIKey")

unique_cids <- unique(compounds_pubchem$CID)
if (length(unique_cids) == 0L) {
  stop(
    "No valid CIDs left to annotate; check the curated compound list.",
    call. = FALSE
  )
}

# Batches of at most 20 CIDs each.
cid_batches <- split(unique_cids, ceiling(seq_along(unique_cids) / 20))

# `availableCores() - 2` is zero or negative on machines with one or two
# cores, which is not a valid worker count, so use at least one worker.
# The upper bound of 5 keeps the number of concurrent requests in line with
# PubChem's published limit of 5 requests per second
# (NOTE(review): relax it if fetch_batch() already throttles).
n_workers <- min(5L, max(1L, availableCores() - 2L))

plan(multisession, workers = n_workers)
tryCatch(
  with_progress({
    p <- progressor(steps = length(cid_batches))
    chem_info <- future_map_dfr(cid_batches, function(b) {
      batch_info <- fetch_batch(properties, b)
      p()  # tick once the batch has actually been fetched
      batch_info
    })
  }),
  # Always return to the sequential plan, even when a batch fails, so that no
  # multisession workers are left behind.
  finally = plan(sequential)
)
```

## Merge & Finalize

```{r}
compounds_final <- compounds_pubchem %>%
  left_join(chem_info, by = "CID") %>%
  select(
    Herb.Name.Chinese, Herb.Name.Pinyin, Herb.Name.English,
    Compound.Name, IUPACName, Compound.Category,
    MolecularFormula, MolecularWeight, CID, SMILES, InChI, InChIKey
  ) %>%
  mutate(across(where(is.character), trimws))

save(
  compounds_final,
  file = "../../data/processed/tcm/06_compounds_final.RData"
)
```

---

# Step 7: Compound Sharing Statistics (Across Herbs)

```{r}
# compounds_final cannot be used as the source of the herb-CID pairs: it is
# derived from compounds_pubchem, which was reduced to one row per CID with
# distinct(CID, .keep_all = TRUE), so every CID is paired with a single herb
# there and Herb_Count would always be 1. The herb-level pairs are still
# present in compounds_final_filtered, so they are taken from that table and
# restricted to the CIDs that made it into compounds_final.
compound_sharing <- compounds_final_filtered %>%
  mutate(CID = clean_cids(CID)) %>%
  filter(!is.na(CID)) %>%
  semi_join(compounds_final, by = "CID") %>%
  distinct(Herb.Name.Pinyin, CID) %>%
  group_by(CID) %>%
  summarise(
    Herb_Count = n(),
    Herbs = paste(Herb.Name.Pinyin, collapse = ", "),
    .groups = "drop"
  ) %>%
  filter(Herb_Count >= 2) %>%
  arrange(desc(Herb_Count))

save(
  compound_sharing,
  file = "../../results/processed/module01_compound_sharing_stats.RData"
)
```

# Step 8: Download SDF Files

```{r}
library(httr)

unique_cids <- unique(compounds_pubchem$CID)
sdf_files <- "../../data/raw/tcm/sdf/"

# download_all_sdf() is defined outside this document; create the target
# directory up front in case the helper does not do so itself.
dir.create(sdf_files, recursive = TRUE, showWarnings = FALSE)

results <- download_all_sdf(unique_cids, sdf_files)

# Check actual files
actual_files <- list.files(sdf_files, pattern = "\\.sdf$")
cat("\nActual files downloaded:", length(actual_files), "\n")

# Assumes one .sdf file per CID -- TODO confirm against download_all_sdf().
if (length(actual_files) < length(unique_cids)) {
  warning(
    "Found ", length(actual_files), " .sdf files for ",
    length(unique_cids), " requested CIDs; some downloads may have failed.",
    call. = FALSE
  )
}
```

---

# Results

# Final Active Components Count

```{r}
cat("Total active compounds:", n_distinct(compounds_final$CID), "\n")

# compounds_final descends from a table reduced to one row per CID, so a
# compound listed under several herbs is counted for one herb only.
table(compounds_final$Herb.Name.Pinyin)
```

---

# Export Final Tables

## Supplementary Table

```{r}
# Table S1
write.xlsx(
  compounds_clean,
  "../../tables/supplementary/Table_S1_Raw_Compounds_DCABM-TCM.xlsx"
)

# Table S2
write.xlsx(
  compounds_final,
  "../../tables/supplementary/Table_S2_Final_Active_Components.xlsx"
)
```

# Save all final results

```{r}
save(
  compounds_final, compound_sharing,
  file = "../../results/processed/module01_final_results.RData"
)
```

---

<table class="nav-table" width="100%">
<tr>
<td align="left">
[Home](../../index.qmd) | [About](../../about.qmd) | [Methods](../../methods.qmd) | [Results](../../results.qmd)
</td>
<td align="right">
[Next Module](02_component_target_prediction.qmd)
</td>
</tr>
</table>

# References {-}