awk to find overlaps -
i have file columns shown below.
group start end chr1 117132092 118875009 chr1 117027758 119458215 chr1 103756473 104864582 chr1 105093795 106219211 chr1 103354114 104747251 chr1 102741437 105235140 chr1 100090254 101094139 chr1 100426977 101614730 chr2 86644663 87767193 chr2 82473711 83636545 chr2 83896702 85079032 chr2 83876122 85091910 chr2 82943211 84350917 chr3 89410051 90485635 chr3 89405753 90485635 chr3 86491492 87593215 chr3 82507157 83738004 chr3 85059618 86362254
i find overlap between coordinates in each group(grouped chr1,chr2,chr3..).
the start , end coordinates has checked if there atleast 50% overlap others in same group. if there atleast 50% overlap, new start , end coordinates has reported in columns 3 , 4 (which range of overlap region). if don't overlap has report original start , end in columns 3 , 4.
to make more clear, lets take first 2 rows
117132092..........118875009 117027758...........................119458215
since both of them overlap atleast 50% each other, range of overlap reported new start , new end in output. , row 3 , 4 doesn't overlap others , original coordinates reported new start , new end in column 3 , 4. , again since rows 5 , 6 have 50% overlap each other range reported new start , new end in column 3 , 4. here expected output:
group start end newstart newend chr1 117132092 118875009 117027758 119458215 chr1 117027758 119458215 117027758 119458215 chr1 103756473 104864582 103354114 104864582 chr1 105093795 106219211 105093795 106219211 chr1 103354114 104747251 102741437 105235140 chr1 102741437 105235140 102741437 105235140 chr1 100090254 101094139 100090254 101614730 chr1 100426977 101614730 100090254 101614730 chr2 86644663 87767193 86644663 87767193 chr2 82473711 83636545 82473711 83636545 chr2 83896702 85079032 83876122 85091910 chr2 83876122 85091910 83876122 85091910 chr2 82943211 84350917 82943211 84350917 chr3 89410051 90485635 89405753 90485635 chr3 89405753 90485635 89405753 90485635 chr3 86491492 87593215 86491492 87593215 chr3 82507157 83738004 82507157 83738004 chr3 85059618 86362254 85059618 86362254
i have achieved in r programming language original file huge , take long time run. in awk.
using gnu awk version 4, try:
gawk -f a.awk file file
where a.awk
is:
nr==fnr { if (fnr>1) { a[$1][++i]=$2 b[$1][i]=$3 } next } fnr==1 { fmt="%-7s%-10s%-10s%-10s%-10s\n" printf fmt,"group","start","end","newstart","newend" } fnr>1{ $4=$2; $5=$3 n=checkinside($1,$2,$3) if (n>0) { ff=0; x=$2; y=$3 (i=1; i<=n; i++) { ar=a[$1][r[i]]; br=b[$1][r[i]]; getintersect($2,$3,ar,br) getlargest($2,$3,ar,br) ovl=((i2-i1)/($3-$2))*100; ovr=((i2-i1)/(br-ar))*100; if (ovl>50 && ovr>50) { if (r1<x) x=r1 if (r2>y) y=r2 ff=1 } } if (ff) { $4=x; $5=y } } printf fmt,$1,$2,$3,$4,$5 } function getlargest(x1,y1,x2,y2) { r1=(x1<=x2)?x1:x2 r2=(y1>=y2)?y1:y2 } function getintersect(x1,y1,x2,y2) { if (x1>=x2 && x1<=y2) { i1=x1; } else { i1=x2; } i2=(y1<=y2)?y1:y2 } function checkinside(g,x,y,i,j,x1,y1) { r["x"]=0 (i in a[g]) { x1=a[g][i]; y1=b[g][i]; if ((x>=x1 && x<=y1) || (y>=x1 && y<=y1)) { if (!(x==x1 && y==y1)) r[++j]=i } } return j }
output:
group start end newstart newend chr1 117132092 118875009 117027758 119458215 chr1 117027758 119458215 117027758 119458215 chr1 103756473 104864582 103354114 104864582 chr1 105093795 106219211 105093795 106219211 chr1 103354114 104747251 102741437 105235140 chr1 102741437 105235140 102741437 105235140 chr1 100090254 101094139 100090254 101614730 chr1 100426977 101614730 100090254 101614730 chr2 86644663 87767193 86644663 87767193 chr2 82473711 83636545 82473711 83636545 chr2 83896702 85079032 83876122 85091910 chr2 83876122 85091910 83876122 85091910 chr2 82943211 84350917 82943211 84350917 chr3 89410051 90485635 89405753 90485635 chr3 89405753 90485635 89405753 90485635 chr3 86491492 87593215 86491492 87593215 chr3 82507157 83738004 82507157 83738004 chr3 85059618 86362254 85059618 86362254
Comments
Post a Comment