#!/bin/bash
#
# Site 1 embedding extraction.
#
# Runs extract-embeddings.py over the site-1 train and test patient-ID split
# CSVs, then collects the resulting parquet files and selected columns of each
# split CSV into ${OUTPUT_BASE}/fl_ready/ for federated learning.
#
# Usage: run from the directory that contains extract-embeddings.py. The
# relative OUTPUT_BASE and SITE_SPLITS paths are resolved against the current
# working directory.
#
# NOTE(review): the non-ASCII glyphs in the status messages below look like
# mis-decoded emoji. They are kept as found; restore them from an upstream copy
# if one is available.

# -e: stop on the first failing command; -u: reject unset variables;
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail

readonly ROOT_DIR="/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/"
readonly OUTPUT_BASE="site1_data"
readonly SITE_SPLITS="../subsets/site_splits"
readonly FL_READY_DIR="${OUTPUT_BASE}/fl_ready"

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
#######################################
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print the path of the patient-ID/label CSV for one split.
# Globals:   SITE_SPLITS (read)
# Arguments: $1 - split name ("train" or "test")
# Outputs:   the CSV path on stdout (no trailing newline)
#######################################
split_csv() {
  printf '%s/%s_pid_labelsT0T7_split_1.csv' "${SITE_SPLITS}" "$1"
}

#######################################
# Run extract-embeddings.py for one split.
# Globals:   ROOT_DIR, OUTPUT_BASE (read)
# Arguments: $1 - split name ("train" or "test")
# Returns:   exit status of the python process
#######################################
run_extraction() {
  local split=$1
  local pid_csv
  pid_csv=$(split_csv "${split}")

  python extract-embeddings.py \
    --root-dir "${ROOT_DIR}" \
    --pid-csv "${pid_csv}" \
    --output-dir "${OUTPUT_BASE}/${split}" \
    --num-workers 8 \
    --checkpoint-interval 500
}

#######################################
# Write fields 1 and 14-20 of a split CSV (header row included) to
# ${FL_READY_DIR}/site1_labels-<split>.csv.
# Globals:   FL_READY_DIR (read)
# Arguments: $1 - split name ("train" or "test")
#######################################
write_labels() {
  local split=$1
  local src
  src=$(split_csv "${split}")

  echo "Creating site1_labels-${split}.csv..."
  # A single cut over the whole file keeps the header and the data rows in
  # order, and its exit status is not hidden behind a pipeline.
  # NOTE(review): splitting on "," assumes no field contains a quoted comma;
  # presumably fields 14-20 are the label columns - confirm against the header.
  cut -d, -f1,14-20 "${src}" > "${FL_READY_DIR}/site1_labels-${split}.csv"
}

main() {
  local split

  echo "============================================"
  echo "SITE 1 - Embedding Extraction"
  echo "============================================"
  echo ""

  # Fail before the long-running extraction if either split CSV is missing,
  # rather than discovering it after the training pass has finished.
  for split in train test; do
    [[ -r "$(split_csv "${split}")" ]] \
      || die "split CSV not readable: $(split_csv "${split}")"
  done

  mkdir -p "${OUTPUT_BASE}/train" "${OUTPUT_BASE}/test"

  echo "π¦ Extracting TRAINING embeddings..."
  run_extraction train
  echo ""
  echo "β Training embeddings complete!"
  echo ""

  echo "π¦ Extracting TEST embeddings..."
  run_extraction test
  echo ""
  echo "β Test embeddings complete!"
  echo ""

  echo "π Preparing files for federated learning..."
  mkdir -p "${FL_READY_DIR}"

  cp "${OUTPUT_BASE}/train/all_embeddings.parquet" \
    "${FL_READY_DIR}/site1_embeddings_train.parquet"
  cp "${OUTPUT_BASE}/test/all_embeddings.parquet" \
    "${FL_READY_DIR}/site1_embeddings_test.parquet"

  write_labels train
  write_labels test

  echo ""
  echo "============================================"
  echo "SITE 1 - COMPLETE! β"
  echo "============================================"
  echo ""
  echo "FL-ready files in: ${OUTPUT_BASE}/fl_ready/"
  ls -lh "${FL_READY_DIR}/"
  echo ""
  echo "Files ready for federated learning:"
  echo " β site1_embeddings_train.parquet"
  echo " β site1_embeddings_test.parquet"
  echo " β site1_labels-train.csv"
  echo " β site1_labels-test.csv"
  echo ""
}

main "$@"