-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge_coverage.py
More file actions
executable file
·72 lines (56 loc) · 1.94 KB
/
merge_coverage.py
File metadata and controls
executable file
·72 lines (56 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env pypy3
"""
Script that merges consecutive genomic intervals that have identical coverage values.
Input format:
A tab-separated file with four columns per line:
chrom start end coverage
where:
- chrom: chromosome name (string)
- start: interval start position (integer, zero-based, inclusive)
- end: interval end position (integer, zero-based, exclusive)
- coverage: coverage value (integer or float)
Output format:
Same as input: tab-separated with columns
chrom start end coverage
Merging criteria:
- Intervals are on the same chromosome
- Coverage values are equal
- Intervals are directly adjacent (the end coordinate of one equals the start coordinate of the next)
Usage:
cat intervals.bedGraph | merge_intervals.py > merged_intervals.bedGraph
Example:
Input file (intervals.bedGraph):
chr1 10 20 5
chr1 20 30 5
chr1 30 40 3
chr2 5 15 2
chr2 15 25 2
Command:
cat intervals.bedGraph | python merge_intervals.py
Output:
chr1 10 30 5
chr1 30 40 3
chr2 5 25 2
"""
##########################
import sys
import signal
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
def merge_coverage(file):
prev = []
for lineno, line in enumerate(file, 1):
fields = line.strip().split('\t')
if len(fields) != 4:
sys.stderr.write(f"Error on line {lineno}: expected 4 fields, got {len(fields)} → {line}")
sys.exit(1)
if not prev:
prev = fields
elif prev[0] != fields[0] or prev[2] != fields[1] or prev[3] != fields[3]:
print('\t'.join(prev))
prev = fields
else:
prev[2] = fields[2] # Extend end position
if prev:
print('\t'.join(prev))
if __name__ == "__main__":
merge_coverage(sys.stdin)