{ "cells": [ { "cell_type": "markdown", "id": "715b70e8-5218-4542-bb99-33d72ecb44d6", "metadata": {}, "source": [ "# Preprocessing for bulk data \n", "\n", "The bulk sample you want to deconvolute using _MethylBERT_ also needs to be preprocessed using the `finetune_data_generate` function." ] }, { "cell_type": "code", "execution_count": 1, "id": "e1168bb7-e72e-43ba-b858-23d6dc05abf0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DMRs sorted by areaStat\n", " chr start end length nCG meanMethy1 meanMethy2 \\\n", "1 chr10 134597480 134602875.0 5396 670 0.861029 0.140400 \n", "0 chr7 1268957 1277884.0 8928 753 0.793278 0.129747 \n", "2 chr4 1395812 1402597.0 6786 663 0.831162 0.185272 \n", "5 chr16 54962053 54967980.0 5928 546 0.783631 0.096095 \n", "9 chr18 76736906 76741580.0 4675 510 0.829475 0.104403 \n", "\n", " diff.Methy areaStat abs_areaStat abs_diff.Methy ctype dmr_id \n", "1 0.720629 6144.089331 6144.089331 0.720629 T 0 \n", "0 0.663531 5722.091790 5722.091790 0.663531 T 1 \n", "2 0.645891 4941.410089 4941.410089 0.645891 T 2 \n", "5 0.687536 4714.551799 4714.551799 0.687536 T 3 \n", "9 0.725072 4684.608381 4684.608381 0.725072 T 4 \n", "Number of DMRs to extract sequence reads: 20\n", "Fine-tuning data generated: name flag ref_name ref_pos \\\n", "0 SRR10166000.9089788_9089788_length=151 147 chr10 131767360 \n", "1 SRR10165998.65829390_65829390_length=150 163 chr4 20254248 \n", "2 SRR10165467.85837758_85837758_length=151 99 chr4 1401206 \n", "3 SRR10165995.16747267_16747267_length=149 83 chr2 176945656 \n", "4 SRR10165995.46034072_46034072_length=151 99 chr4 20253524 \n", "\n", " map_quality cigar next_ref_name next_ref_pos length \\\n", "0 42 151M = 131767187 -324 \n", "1 23 151M = 20254343 244 \n", "2 40 151M = 1401285 227 \n", "3 40 149M = 176945572 -233 \n", "4 40 151M = 20253771 398 \n", "\n", " seq ... NM \\\n", "0 GTGGAGTGTCGTTGCGTAGTCGGGAGTCGGGAGTAGAATAGTTTGG... ... 
49 \n", "1 GGGGATTCTACCTTTACCATCAAATATCTACCGCGAAACTACGACT... ... 35 \n", "2 AAAATGAGAGATTGTTTGTTTTTTTTAATTTGTTTTTAAAAGGGGG... ... 40 \n", "3 AAATAACTTAATCTACTTCTCTCCGACCAAACCCAACCCCAAATAC... ... 35 \n", "4 TCGGATTTGGTGTTATTTATTTGGGAAGCGTCCGGACGGCGGAGCT... ... 2 \n", "\n", " XM XR \\\n", "0 ........xZ.x..Z.x..xZ.....xZ.....x....x..hx...... GA \n", "1 H..............h......xh.h...x..Z.Zx.h..x.Zx..... GA \n", "2 ...........x..h....hhh.h....hxz.hhhhh............ CT \n", "3 x...hh...hh.............Z.....h.........z.h...... CT \n", "4 .Z...h......................Z.hXZ...Z..Z....H.... CT \n", "\n", " PG RG \\\n", "0 MarkDuplicates-287B47C6 diffuse_large_B_cell_lymphoma_test_8 \n", "1 MarkDuplicates-3DAAB091 diffuse_large_B_cell_lymphoma_test_8 \n", "2 MarkDuplicates-36E4BA78 Bcell_noncancer_test_8 \n", "3 MarkDuplicates-74536757 diffuse_large_B_cell_lymphoma_test_8 \n", "4 MarkDuplicates-74536757 diffuse_large_B_cell_lymphoma_test_8 \n", "\n", " dna_seq \\\n", "0 GTG TGG GGA GAG AGT GTG TGC GCC CCG CGC GCT CT... \n", "1 GTT TTT TTC TCT CTT TTC TCT CTA TAC ACC CCT CT... \n", "2 AAA AAA AAT ATG TGA GAG AGA GAG AGA GAC ACT CT... \n", "3 GAA AAT ATG TGG GGC GCT CTT TTG TGG GGT GTC TC... \n", "4 TCG CGG GGA GAC ACT CTT TTG TGG GGT GTG TGT GT... \n", "\n", " methyl_seq dmr_ctype dmr_label ctype \n", "0 2222222212222122222122222212222222222222222222... T 5 NA \n", "1 2222222222222222222222222222221212222222122222... T 19 NA \n", "2 2222222222222222222222222222202222222222222222... T 2 NA \n", "3 2222222222222222222222122222222222222202222222... T 12 NA \n", "4 1222222222222222222222222221222122212212222222... 
T 19 NA \n", "\n", "[5 rows x 23 columns]\n" ] } ], "source": [ "from methylbert.data import finetune_data_generate as fdg\n", "\n", "# Input files (relative paths; adjust them to your own setup)\n", "f_bam = \"../test/data/bulk.bam\"\n", "f_dmr = \"../test/data/dmrs.csv\"\n", "f_ref = \"../../../genome/hg19.fa\"\n", "\n", "# Directory passed as output_dir below\n", "out_dir = \"tmp/\"\n", "\n", "fdg.finetune_data_generate(\n", "    input_file=f_bam,\n", "    f_dmr=f_dmr,\n", "    f_ref=f_ref,\n", "    output_dir=out_dir,\n", "    n_mers=3, # 3-mer DNA sequences\n", "    n_cores=20, # presumably the number of parallel workers; lower it if your machine has fewer cores\n", ")" ] }, { "cell_type": "markdown", "id": "01a4e20f-6589-4111-b3ce-b88956dfe926", "metadata": {}, "source": [ "This process generates a new file, `data.csv`, which contains the preprocessed bulk data." ] }, { "cell_type": "code", "execution_count": 2, "id": "681b9ccf-5afc-42c3-b2d9-0b7995effbd7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data.csv dmrs.csv test_seq.csv train_seq.csv\n" ] } ], "source": [ "!ls {out_dir}" ] }, { "cell_type": "markdown", "id": "a709eb70-0dd2-446c-907b-a2dff96c016f", "metadata": {}, "source": [ "Since the cell-type information is not given with the bulk sample, the `ctype` column contains only `NaN` values." ] }, { "cell_type": "code", "execution_count": 3, "id": "c52e4fef-32bb-4f68-9710-43cf8f1c76c7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameflagref_nameref_posmap_qualitycigarnext_ref_namenext_ref_poslengthseq...NMXMXRPGRGdna_seqmethyl_seqdmr_ctypedmr_labelctype
0SRR10166000.9089788_9089788_length=151147chr1013176736042151M=131767187-324GTGGAGTGTCGTTGCGTAGTCGGGAGTCGGGAGTAGAATAGTTTGG......49........xZ.x..Z.x..xZ.....xZ.....x....x..hx......GAMarkDuplicates-287B47C6diffuse_large_B_cell_lymphoma_test_8GTG TGG GGA GAG AGT GTG TGC GCC CCG CGC GCT CT...2222222212222122222122222212222222222222222222...T5NaN
1SRR10165998.65829390_65829390_length=150163chr42025424823151M=20254343244GGGGATTCTACCTTTACCATCAAATATCTACCGCGAAACTACGACT......35H..............h......xh.h...x..Z.Zx.h..x.Zx.....GAMarkDuplicates-3DAAB091diffuse_large_B_cell_lymphoma_test_8GTT TTT TTC TCT CTT TTC TCT CTA TAC ACC CCT CT...2222222222222222222222222222221212222222122222...T19NaN
2SRR10165467.85837758_85837758_length=15199chr4140120640151M=1401285227AAAATGAGAGATTGTTTGTTTTTTTTAATTTGTTTTTAAAAGGGGG......40...........x..h....hhh.h....hxz.hhhhh............CTMarkDuplicates-36E4BA78Bcell_noncancer_test_8AAA AAA AAT ATG TGA GAG AGA GAG AGA GAC ACT CT...2222222222222222222222222222202222222222222222...T2NaN
3SRR10165995.16747267_16747267_length=14983chr217694565640149M=176945572-233AAATAACTTAATCTACTTCTCTCCGACCAAACCCAACCCCAAATAC......35x...hh...hh.............Z.....h.........z.h......CTMarkDuplicates-74536757diffuse_large_B_cell_lymphoma_test_8GAA AAT ATG TGG GGC GCT CTT TTG TGG GGT GTC TC...2222222222222222222222122222222222222202222222...T12NaN
4SRR10165995.46034072_46034072_length=15199chr42025352440151M=20253771398TCGGATTTGGTGTTATTTATTTGGGAAGCGTCCGGACGGCGGAGCT......2.Z...h......................Z.hXZ...Z..Z....H....CTMarkDuplicates-74536757diffuse_large_B_cell_lymphoma_test_8TCG CGG GGA GAC ACT CTT TTG TGG GGT GTG TGT GT...1222222222222222222222222221222122212212222222...T19NaN
\n", "

5 rows × 23 columns

\n", "
" ], "text/plain": [ " name flag ref_name ref_pos \\\n", "0 SRR10166000.9089788_9089788_length=151 147 chr10 131767360 \n", "1 SRR10165998.65829390_65829390_length=150 163 chr4 20254248 \n", "2 SRR10165467.85837758_85837758_length=151 99 chr4 1401206 \n", "3 SRR10165995.16747267_16747267_length=149 83 chr2 176945656 \n", "4 SRR10165995.46034072_46034072_length=151 99 chr4 20253524 \n", "\n", " map_quality cigar next_ref_name next_ref_pos length \\\n", "0 42 151M = 131767187 -324 \n", "1 23 151M = 20254343 244 \n", "2 40 151M = 1401285 227 \n", "3 40 149M = 176945572 -233 \n", "4 40 151M = 20253771 398 \n", "\n", " seq ... NM \\\n", "0 GTGGAGTGTCGTTGCGTAGTCGGGAGTCGGGAGTAGAATAGTTTGG... ... 49 \n", "1 GGGGATTCTACCTTTACCATCAAATATCTACCGCGAAACTACGACT... ... 35 \n", "2 AAAATGAGAGATTGTTTGTTTTTTTTAATTTGTTTTTAAAAGGGGG... ... 40 \n", "3 AAATAACTTAATCTACTTCTCTCCGACCAAACCCAACCCCAAATAC... ... 35 \n", "4 TCGGATTTGGTGTTATTTATTTGGGAAGCGTCCGGACGGCGGAGCT... ... 2 \n", "\n", " XM XR \\\n", "0 ........xZ.x..Z.x..xZ.....xZ.....x....x..hx...... GA \n", "1 H..............h......xh.h...x..Z.Zx.h..x.Zx..... GA \n", "2 ...........x..h....hhh.h....hxz.hhhhh............ CT \n", "3 x...hh...hh.............Z.....h.........z.h...... CT \n", "4 .Z...h......................Z.hXZ...Z..Z....H.... CT \n", "\n", " PG RG \\\n", "0 MarkDuplicates-287B47C6 diffuse_large_B_cell_lymphoma_test_8 \n", "1 MarkDuplicates-3DAAB091 diffuse_large_B_cell_lymphoma_test_8 \n", "2 MarkDuplicates-36E4BA78 Bcell_noncancer_test_8 \n", "3 MarkDuplicates-74536757 diffuse_large_B_cell_lymphoma_test_8 \n", "4 MarkDuplicates-74536757 diffuse_large_B_cell_lymphoma_test_8 \n", "\n", " dna_seq \\\n", "0 GTG TGG GGA GAG AGT GTG TGC GCC CCG CGC GCT CT... \n", "1 GTT TTT TTC TCT CTT TTC TCT CTA TAC ACC CCT CT... \n", "2 AAA AAA AAT ATG TGA GAG AGA GAG AGA GAC ACT CT... \n", "3 GAA AAT ATG TGG GGC GCT CTT TTG TGG GGT GTC TC... \n", "4 TCG CGG GGA GAC ACT CTT TTG TGG GGT GTG TGT GT... 
\n", "\n", " methyl_seq dmr_ctype dmr_label ctype \n", "0 2222222212222122222122222212222222222222222222... T 5 NaN \n", "1 2222222222222222222222222222221212222222122222... T 19 NaN \n", "2 2222222222222222222222222222202222222222222222... T 2 NaN \n", "3 2222222222222222222222122222222222222202222222... T 12 NaN \n", "4 1222222222222222222222222221222122212212222222... T 19 NaN \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "# Read the tab-separated file and preview its first rows\n", "bulk_reads = pd.read_csv(\"tmp/data.csv\", sep=\"\\t\")\n", "bulk_reads.head()" ] } ], "metadata": { "kernelspec": { "display_name": "dnabert", "language": "python", "name": "dnabert" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 5 }