-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsamplecov.sh
More file actions
executable file
·221 lines (186 loc) · 7.14 KB
/
samplecov.sh
File metadata and controls
executable file
·221 lines (186 loc) · 7.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/bin/bash -ex
# Script that computes per-sample coverage from BAM/CRAM or bedGraph files.
# It automatically extracts chromosome lists, processes coverage by chromosome,
# and merges results into a single bedGraph.gz file with a tabix index.
########################################
# Initialize default values
opt_dry_run=false # Whether to only print commands, not run them
tmp="tmp" # Prefix for temporary files
cram_files=() # Array to store BAM/CRAM input files
bedGraph_files=() # Array to store bedGraph input files
is_crams=false # Flag to indicate whether CRAM/BAM mode is active
P=16 # Default number of parallel threads
PYTHON=$(command -v pypy3 || command -v python3) # If pypy3 is installed, use it; otherwise, default to python3
ulimit -n 65536
########################################
# Define help message
print_help() {
echo "Script which computes the sample coverage from multiple BAM/CRAM or bedGraph indexed files"
echo
echo "Usage: $0 [options] <input_files>"
echo
echo "Options:"
echo " -h, --help Display this help and exit"
echo " -r <index> Reference FASTA index (.fai) or list of chromosomes/regions"
echo " -p <threads> Number of parallel threads (default: 16)"
echo " -o <sample> Output sample file name (required)"
echo " --dry-run Just print commands instead of executing"
echo
echo "input_files: sorted & indexed bam/cram/bedGraph.gz files"
echo
echo "Example:"
echo " $0 -o output.gz input1.cram input2.cram ..."
echo " $0 -r ref.ids -o output.gz input1.bam input2.bam ..."
echo " $0 -r ref.ids -o output.gz input1.gz input2.gz ..."
echo
}
########################################
# Check if external command is available
check_cmd() {
command -v "$1" >/dev/null 2>&1 || {
echo "Error: $1 not found." >&2
exit 1
}
}
########################################
# Parse command-line arguments
while [[ $# -gt 0 ]]; do
case "$1" in
-h|--help)
print_help
exit 0
;;
-p)
P="$2"
shift 2
;;
-r)
ref_index="$2"
shift 2
;;
--dry-run)
opt_dry_run=true
shift
;;
-o)
out_sample="$2"
shift 2
;;
-*)
echo "Unknown option: $1"
exit 1
;;
*)
# Determine file type based on file extension
case "$1" in
*.cram) cram_files+=("$1") ;;
*.bam) cram_files+=("$1") ;;
*.gz) bedGraph_files+=("$1") ;;
*) echo "Missing input files or unknown extension" ; exit 1 ;;
esac
# Check that the input file exists and is readable
if [[ ! -r "$1" ]] ; then
echo "Error: input file $1 does not exist or is unreadable"
exit 1
fi
shift
;;
esac
done
########################################
# Check if reference index exists if provided
if [[ -n "$ref_index" ]] && [[ ! -r "$ref_index" ]] ; then
echo "Error: reference index $ref_index does not exist or is unreadable"
exit 1
fi
########################################
# Check that -o (sample output filename) is specified
if [[ ! -n "$out_sample" ]] ; then
echo "Error: option -o must be defined; use -h for details"
exit 1
fi
# Derive prefix for temporary files based on output filename
sample_prefix="${out_sample%%.*}"
# Check if output already exists (both file and tabix index)
if [[ -s $out_sample ]] && [[ -s $out_sample.tbi ]] ; then
echo "File $out_sample already exists"
exit 0
fi
# Compose unique tmp file prefix
tmp="$tmp.$sample_prefix"
########################################
# Ensure at least one input file was provided
if [[ ${#cram_files[@]} -eq 0 ]] && [[ ${#bedGraph_files[@]} -eq 0 ]] ; then
echo "Error: No BAM/CRAM or bedGraph files provided."
exit 1
elif [[ ${#cram_files[@]} -gt 0 ]] && [[ ${#bedGraph_files[@]} -gt 0 ]] ; then
echo "Error: Either BAM/CRAM OR bedGraph files must be provided, not both."
elif [[ ${#cram_files[@]} -gt 0 ]] ; then
printf "%s\n" "${cram_files[@]}" > $tmp.files
is_crams=true
else
printf "%s\n" "${bedGraph_files[@]}" > $tmp.files
fi
########################################
# Check that required tools are installed
for cmd in python3 parallel bgzip tabix samtools readlink; do
check_cmd "$cmd"
done
########################################
# Determine path of helper scripts (assumed in same folder as this script)
script_path=$(dirname $(readlink -f $0))
########################################
# Create chromosome-to-file mapping
echo "Gathering files and chromosome ... "
if [[ -n "$ref_index" ]] ; then
# Use the reference index (.fai) to list all chromosomes,
# then compute cartesian product with file list
cut -f1 $ref_index | $script_path/cartesian_product.sh /dev/stdin $tmp.files > $tmp.chr_files
elif [[ $is_crams == "true" ]] ; then
# Extract chromosomes directly from CRAM/BAM headers
cat $tmp.files | $script_path/extract_chromosomes.sh > $tmp.chr_files
else
# For bedGraph input, list chromosomes using tabix -l
cat $tmp.files | $script_path/list_chromosomes.sh > $tmp.chr_files
fi
########################################
# Prepare coverage computation commands for each chromosome
rm -f $tmp.sh
cat $tmp.chr_files \
| $PYTHON $script_path/group_chromosomes.py \
| tee $tmp.chr_files_grouped \
| while IFS=$'\t' read -r chr files; do
# Skip chromosome if already processed and indexed
if [[ ! -s "$tmp.$chr.bedGraph.gz.tbi" ]] ; then
if [[ $is_crams == "true" ]] ; then
# Construct pipeline for BAM/CRAM files
echo -n "samtools depth $files -r $chr | $PYTHON $script_path/count_coverage.py | $PYTHON $script_path/merge_coverage.py " >> $tmp.sh
else
# Construct pipeline for bedGraph files
echo -n "$script_path/tabix.sh $files $chr | $PYTHON $script_path/sum_coverage.py " >> $tmp.sh
fi
# Compress output and index with tabix
echo "| bgzip > $tmp.$chr.bedGraph.gz ; tabix -p bed $tmp.$chr.bedGraph.gz" >> $tmp.sh
fi
done
########################################
# Run coverage computation in parallel, or show commands if dry-run
if [[ ! "$opt_dry_run" == "true" ]] ; then
# Execute all commands in parallel
echo "Running $0 in parallel ... "
cat $tmp.sh | parallel -j $P 2>/dev/null
#check all files were generated
cat $tmp.sh | awk '{print $NF}' | xargs ls > /dev/null
cat $tmp.sh | awk '{print $NF}' | sed 's|$|.tbi|' | xargs ls > /dev/null
# Concatenate all per-chromosome bedGraph files into final output
ls $tmp.*.bedGraph.gz | sort -f | xargs cat > $out_sample
tabix -p bed $out_sample
# Clean up temporary files
rm $tmp.*
echo "Done"
else
# Print commands for inspection, but do not execute
cat $tmp.sh
echo "ls $tmp.*.bedGraph.gz | sort -f | xargs cat > $out_sample ; tabix -p bed $out_sample"
rm $tmp.sh
fi