Module 01: Screening of ZWHQD Active Components
1 Overview
This module collects the blood-absorbed ingredients of Zhenwu Huangqi Decoction (ZWHQD) from the DCABM-TCM database [1] (http://bionet.ncpsb.org.cn/dcabm-tcm/). After multi-stage screening, the retained active components are standardized and annotated with chemical information retrieved from PubChem [2] (https://pubchem.ncbi.nlm.nih.gov/).
Screening Criteria: (1) remove compounds without a valid PubChem CID; (2) remove toxic diester-type alkaloids from Fu Zi; (3) remove basic nutrients (sugars, amino acids, common metabolites); (4) manually add cycloastragenol, a key active metabolite of Huang Qi [3].
Final Output: Standardized active component dataset & supplementary tables
2 Load Packages
3 Step 1: Load Raw Data from DCABM-TCM
Input: data/raw/tcm/zwhqd_compound_dcabm.xlsx
4 Step 2: Raw Data Preprocessing
Parse compound names, CIDs, formulas, and clean formatting.
Code
# Load the shared project helpers. Later steps call clean_cids() and
# fetch_batch(), which are not defined in this document
# (presumably they come from utils.R -- confirm).
source("../../scripts/utils.R")

# Turn the raw export into one row per compound with separate
# CID / Compound / Formula columns.
#
# The parsing below relies on "$"-anchored patterns, so every split piece
# must be free of trailing whitespace. The first trimws() pass only trims the
# whole cell, not the pieces around each "|", so the pieces are trimmed again
# right after splitting; otherwise the trailing "(...)" groups are not
# recognised and Compound / Formula end up holding the wrong fragments.
compounds_clean <- compounds_raw %>%
  mutate(across(where(is.character), trimws)) %>%
  # `Compounds` holds several "|"-delimited entries per cell.
  separate_rows(Compounds, sep = "\\|") %>%
  mutate(
    Compounds = str_trim(Compounds),
    # Text following "Pubchem CID:" / "DCABM ID:" up to the closing ")".
    CID = str_extract(Compounds, "(?<=Pubchem CID:)[^)]+") %>% str_trim(),
    DCABM.ID = str_extract(Compounds, "(?<=DCABM ID:)[^)]+") %>% str_trim(),
    # Entry with the "(Pubchem CID:...)" group removed.
    temp = str_remove(Compounds, "\\s+\\(Pubchem CID:[^)]+\\)")
  ) %>%
  mutate(
    # Drop the last remaining parenthesised group ...
    temp2 = str_remove(temp, "\\([^)]+\\)$") %>% str_trim(),
    # ... then the group that is now last is read as the formula and the
    # text in front of it as the compound name.
    # NOTE(review): assumes every entry carries a formula group; an entry
    # without one would lose a trailing "(...)" from its name -- confirm.
    Compound = str_remove(temp2, "\\([^)]+\\)$") %>% str_trim(),
    Formula = str_match(temp2, "\\(([^)]+)\\)$")[, 2] %>% str_trim()
  ) %>%
  # Helper columns (and the extracted DCABM ID) are not kept.
  select(-Compounds, -temp, -temp2, -DCABM.ID) %>%
  mutate(across(where(is.character), trimws))
4.1 Save processed data
5 Step 3: Manually Add Cycloastragenol
Code
# Append cycloastragenol (key active metabolite of Huang Qi) as a manual
# entry, then order the table by herb and CID. CID is a character column
# here, so the ordering within a herb is textual, not numeric.
compounds_add <- arrange(
  add_row(
    compounds_clean,
    Herb.Name.Chinese = "黄芪",
    Herb.Name.Pinyin = "HUANG QI",
    Herb.Name.English = "Astragali Radix",
    Herb.Name.Latin = "Astragalus membranaceus",
    Type = "Metabolite",
    CID = "13943286",
    Compound = "cycloastragenol",
    Formula = "C30H50O5"
  ),
  Herb.Name.Pinyin, CID
)

# Persist the intermediate table for later steps.
save(
  compounds_add,
  file = "../../data/processed/tcm/03_compounds_add_cycloastragenol.RData"
)
6 Step 4: Filter Invalid & Duplicate Compounds
Code
# Keep only compounds with a usable PubChem CID and remove per-herb
# duplicates.
compounds_filtered <- compounds_add %>%
  # Drop placeholder ("-"), missing and empty CIDs before normalising.
  filter(CID != "-", !is.na(CID), CID != "") %>%
  mutate(CID = clean_cids(CID)) %>%
  # Re-check after normalisation, mirroring the PubChem retrieval step,
  # which also filters NA right after clean_cids(). Without this, any CID
  # that clean_cids() turns into NA would be kept and all such rows of a
  # herb would be merged into one by distinct().
  # NOTE(review): clean_cids() is defined outside this document; this
  # assumes it can return NA for unparseable values -- confirm.
  filter(!is.na(CID)) %>%
  # One row per herb/CID, then one row per herb/compound name
  # (the first occurrence is kept in both cases).
  distinct(Herb.Name.Chinese, CID, .keep_all = TRUE) %>%
  distinct(Herb.Name.Chinese, Compound, .keep_all = TRUE)

save(
  compounds_filtered,
  file = "../../data/processed/tcm/04_compounds_filtered.RData"
)
7 Step 5: Load Manually Curated Properties & Final Filter
Input: Manually annotated compound list
Code
# Read the hand-curated annotation sheet. It is loaded from disk as-is;
# nothing in this document shows how it was derived from compounds_filtered.
compounds_manual <- read.xlsx(
  "../../data/raw/tcm/compounds_properties_manual.xlsx"
)

# Keep the rows marked Drop == "No" (rows with a missing Drop value are
# removed as well), discard the curation-only columns, and keep one row per
# herb/CID pair.
compounds_final_filtered <- compounds_manual %>%
  dplyr::filter(Drop == "No") %>%
  dplyr::select(-Note, -Compound.Toxic, -Drop) %>%
  distinct(Herb.Name.Pinyin, CID, .keep_all = TRUE)

save(
  compounds_final_filtered,
  file = "../../data/processed/tcm/05_compounds_final_filtered.RData"
)
8 Step 6: Batch Retrieve PubChem Annotations
Code
# load( "../../data/processed/tcm/05_compounds_final_filtered.RData")

# Normalise CIDs, drop rows without one, and keep a single row per CID.
# NOTE(review): distinct(CID) keeps only the first herb for a compound that
# occurs in several herbs, so herb attribution in everything built from
# compounds_pubchem is one-herb-per-compound -- confirm this is intended.
compounds_pubchem <- compounds_final_filtered %>%
  mutate(CID = clean_cids(CID)) %>%
  filter(!is.na(CID)) %>%
  distinct(CID, .keep_all = TRUE)

# Property names handed to fetch_batch() for every batch.
properties <- c("IUPACName", "MolecularFormula", "MolecularWeight",
                "SMILES", "InChI", "InChIKey")

# Query PubChem in batches of 20 CIDs.
unique_cids <- unique(compounds_pubchem$CID)
cid_batches <- split(unique_cids, ceiling(seq_along(unique_cids) / 20))

# Leave two cores free, but never request fewer than one worker:
# availableCores() - 2 is 0 or negative on machines with <= 2 cores, which
# is not a valid worker count for a multisession plan.
plan(multisession, workers = max(1L, availableCores() - 2L))

with_progress({
  p <- progressor(steps = length(cid_batches))
  # One progress tick per batch; the per-batch results are row-bound.
  chem_info <- future_map_dfr(cid_batches, function(b) {
    p()
    fetch_batch(properties, b)
  })
})

# Return to sequential evaluation once the retrieval is done.
plan(sequential)
8.1 Merge & Finalize
Code
# Attach the retrieved PubChem properties to the compound table by CID,
# keep the columns of the final dataset in their output order, and trim
# whitespace in every character column.
compounds_final <- mutate(
  select(
    left_join(compounds_pubchem, chem_info, by = "CID"),
    Herb.Name.Chinese, Herb.Name.Pinyin, Herb.Name.English,
    Compound.Name, IUPACName, Compound.Category,
    MolecularFormula, MolecularWeight, CID, SMILES, InChI, InChIKey
  ),
  across(where(is.character), trimws)
)

save(
  compounds_final,
  file = "../../data/processed/tcm/06_compounds_final.RData"
)
9 Step 7: Compound Sharing Statistics (Across Herbs)
Code
# Count, for every final compound, how many herbs contain it and list them;
# only compounds found in at least two herbs are reported.
#
# The herb/CID pairs are taken from compounds_final_filtered, not from
# compounds_final: compounds_final descends from compounds_pubchem, which
# was reduced to one row per CID with distinct(CID), so every CID would
# have Herb_Count == 1 there and the ">= 2" filter would always return an
# empty table.
compound_sharing <- compounds_final_filtered %>%
  # Same CID normalisation as used when building compounds_pubchem, so the
  # values are comparable with compounds_final$CID.
  mutate(CID = clean_cids(CID)) %>%
  filter(!is.na(CID), CID %in% compounds_final$CID) %>%
  distinct(Herb.Name.Pinyin, CID) %>%
  group_by(CID) %>%
  summarise(
    Herb_Count = n(),
    Herbs = paste(unique(Herb.Name.Pinyin), collapse = ", "),
    .groups = "drop"
  ) %>%
  filter(Herb_Count >= 2) %>%
  arrange(desc(Herb_Count))

save(
  compound_sharing,
  file = "../../results/processed/module01_compound_sharing_stats.RData"
)
10 Step8: download SDF
11 Results
12 Final Active Components Count
13 Export Final Tables
13.1 Supplementary Table
14 Save all final results
| Home | About | Methods | Results | Next Module |