diff options
| author | wukong <wukong@longaeva> | 2019-04-13 13:04:26 -0700 |
|---|---|---|
| committer | wukong <wukong@longaeva> | 2019-04-13 13:04:26 -0700 |
| commit | 0e225c6d140a5a4986d0771aefb8a8edbe5d435d (patch) | |
| tree | 78d197f2e6c972e9483c95ee1154bee2b76d15c4 | |
| parent | 187ab8368c39a3fd459d5715c91a71104413299a (diff) | |
optimized covariance script;
added 'nan' output to diff;
added ':' output delimiter to lin_reg and quad_reg;
fixed broken r^2 calc in quad_reg;
Diffstat (limited to '')
| -rw-r--r-- | cov.awk | 12 | ||||
| -rw-r--r-- | diff.awk | 2 | ||||
| -rw-r--r-- | lin_reg.awk | 9 | ||||
| -rw-r--r-- | mean.awk | 2 | ||||
| -rw-r--r-- | mean_avg.awk | 2 | ||||
| -rw-r--r-- | quad_reg.awk | 74 |
6 files changed, 54 insertions, 47 deletions
@@ -4,6 +4,7 @@ # online covariance algorithm BEGIN { + OFS = FS sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -22,16 +23,17 @@ NF > 0 { ### columns for (y=1; y<=nf_max; y++) { - if ($y == header[y]) - continue ### rows for (x=1; x<=nf_max; x++) { + if ($x !~ number || $x == header[x]) + continue count[x,y]++ dx[x,y] = $x - meanx[x,y] + dy[x,y] = $y - meany[x,y] meanx[x,y] += dx[x,y]/count[x,y] - meany[x,y] += ($y - meany[x,y])/count[x,y] - C[x,y] += dx[x,y]*($y - meany[x,y]) - cov_pop[x,y] = C[x,y]/count[x,y] + meany[x,y] += dy[x,y]/count[x,y] + C[x,y] += dx[x,y]*dy[x,y] + # cov_pop[x,y] = C[x,y]/count[x,y] (count[x,y] > 1) ? cov_samp[x,y] = C[x,y]/(count[x,y] - 1) : cov_samp[x,y] = "" } } @@ -55,7 +55,7 @@ NF { continue if ($y ~ number) { data[y] = $y - (data_prev[y] ~ number) ? diff[y] = data[y] - data_prev[y] : diff[y] = "" + (data_prev[y] ~ number) ? diff[y] = data[y] - data_prev[y] : diff[y] = "nan" data_prev[y] = data[y] printf(OFMT, diff[y]) } diff --git a/lin_reg.awk b/lin_reg.awk index aeb5dfc..52e55d3 100644 --- a/lin_reg.awk +++ b/lin_reg.awk @@ -4,6 +4,7 @@ # simple linear regression between columns BEGIN { + OFS = ":" sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -91,15 +92,15 @@ END { for (y=1; y<=nf_max; y++) { for (x=1; x<=nf_max; x++) { if (x != y && r[x,y]) { - printf(OFMT OFS "(%s)" OFS " = (" OFMT " +/- " OFMT ")(%s)" OFS " + (" OFMT " +/- " OFMT ")", + printf(OFMT OFS "(%s)" OFS " = (" OFMT " +/- " OFMT ")(%s) + (" OFMT " +/- " OFMT ")" OFS, (r[x,y]*r[x,y]), header[y], b[x,y], b_err[x,y], header[x], a[x,y], a_err[x,y]) - printf(" [" OFMT "," OFMT "][" OFMT "," OFMT "] [" OFMT "," OFMT "]", + printf("[" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS "[" OFMT "," OFMT "]" OFS, 0, a[x,y], (-1.0*a[x,y]/b[x,y]), 0, mean[x], b[x,y]*(mean[x]) + a[x,y]) - printf(" [" OFMT "," OFMT "]", xw[x,y], yw[x,y]) - printf(" [" OFMT "]" ORS, sqrt(xw_dist[x,y]*xw_dist[x,y] + yw_dist[x,y]*yw_dist[x,y])) + printf("[" OFMT "," OFMT "]" OFS, xw[x,y], yw[x,y]) + printf("[" OFMT "]" ORS, sqrt(xw_dist[x,y]*xw_dist[x,y] + yw_dist[x,y]*yw_dist[x,y])) } } } @@ -1,7 +1,7 @@ #!/usr/bin/awk -f
### mean.awk
-# calculate mean average of serialized numbers
+# calculate mean average of serialized input
BEGIN {
OFS = FS
diff --git a/mean_avg.awk b/mean_avg.awk index dc39118..e4596b0 100644 --- a/mean_avg.awk +++ b/mean_avg.awk @@ -2,6 +2,8 @@ ### mean_avg.awk # average columns of numerical data +# input: delimited data as text +# output: list of univariate summary stats BEGIN { OFS = FS diff --git a/quad_reg.awk b/quad_reg.awk index a587a3a..edd0c6f 100644 --- a/quad_reg.awk +++ b/quad_reg.awk @@ -4,6 +4,8 @@ # quadratic regression along columns BEGIN { + OFS = ":" + pi = 4.0*atan2(1.0, 1.0) sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -34,10 +36,10 @@ NF > 0 { sum4[y] += $y*$y*$y*$y delta0[y] = $y - mean[y] mean[y] += delta0[y]/count[y] + mean2[y] = sum2[y]/count[y] delta1[y] = $y - mean[y] - delta[y] = delta1[y] - delta2[y] = delta0[y]*delta[1] - sum_delta[y] += delta[y] + delta2[y] = delta0[y]*delta1[y] + sum_delta[y] += delta1[y] sum_delta2[y] += delta2[y] ### sample variance @@ -45,21 +47,17 @@ NF > 0 { # x = row, y = col, trendline: y = A + Bx + Cx^2 for (x=1; x<=nf_max; x++) { - count[x,y] += 1 + count_xy[x,y] += 1 sum_xy[x,y] += $x*$y sum_x2y[x,y] += $x*$x*$y - sum_delta_xy[x,y] += delta[x]*delta[y] - sum_delta_xx2[x,y] += delta[x]*delta2[x] - sum_delta_x2y[x,y] += delta2[x]*delta[y] - sum_delta_x2x2[x,y] += delta2[x]*delta2[x] # covariances - if (count[x,y] > 1) { - s_xx[x,y] = sum2_delta[x]/(count[x,y] - 1) - s_xy[x,y] = sum_delta_xy[x,y]/(count[x,y] - 1) - s_xx2[x,y] = sum_delta_xx2[x,y]/(count[x,y] - 1) - s_x2x2[x,y] = sum_delta_x2x2[x,y]/(count[x,y] - 1) - s_x2y[x,y] = sum_delta_x2y[x,y]/(count[x,y] - 1) + if (count_xy[x,y] > 0) { + s_xx[x,y] = sum2[x]/count_xy[x,y] - mean[x]*mean[x] + s_xy[x,y] = sum_xy[x,y]/count_xy[x,y] - mean[x]*mean[y] + s_xx2[x,y] = sum3[x]/count_xy[x,y] - mean[x]*mean2[x] + s_x2x2[x,y] = sum4[x]/count_xy[x,y] - mean2[x]*mean2[x] + s_x2y[x,y] = sum_x2y[x,y]/count_xy[x,y] - mean2[x]*mean[y] } else { s_xx[x,y] = 0 @@ -70,39 +68,43 @@ NF > 0 { } bc_den[x,y] = (s_xx[x,y]*s_x2x2[x,y] - s_xx2[x,y]*s_xx2[x,y]) - if (bc_den[x,y]) { + if (bc_den[x,y] != 0) { c[x,y] = (s_x2y[x,y]*s_xx[x,y] - s_xy[x,y]*s_xx2[x,y])/bc_den[x,y] b[x,y] = (s_xy[x,y]*s_x2x2[x,y] - s_x2y[x,y]*s_xx2[x,y])/bc_den[x,y] } else { c[x,y] = 0 - b[x,y] = 1 + b[x,y] = 0 } a[x,y] = mean[y] - b[x,y]*mean[x] - c[x,y]*mean2[x] # error estimate - err[x,y] = ($y - (a[x,y] + b[x,y]*$x + c[x,y]*$x*$x)) + err[x,y] = $y - a[x,y] - b[x,y]*$x - c[x,y]*$x*$x sum_err2[x,y] += err[x,y]*err[x,y] # correlation - sum_delta2[y] ? r[x,y] = sqrt(1 - sum_err2[x,y]/sum_delta2[y]) : r[x,y] = 0 - - D[x,y] = b[x,y]*b[x,y] - 4.0*a[x,y]*c[x,y] + sum_delta2[y] ? r2[x,y] = 1 - sum_err2[x,y]/sum_delta2[y] : r2[x,y] = 0 + #print a[x,y],b[x,y],c[x,y],err[x,y],sum_err2[x,y] # quadratic roots (x-intercepts) - if (c[x,y]) { - rx0[x,y] = (-1.0*b[x,y] - sqrt(D[x,y])/(2.0*c[x,y])) - rx1[x,y] = (-1.0*b[x,y] + sqrt(D[x,y])/(2.0*c[x,y])) - } - - # vertex of parabola - if (c[x,y]) { + if (c[x,y] != 0) { + D[x,y] = b[x,y]*b[x,y] - 4.0*a[x,y]*c[x,y] + if (D[x,y] >= 0) { + rx0[x,y] = 0.5*(-1.0*b[x,y] - sqrt(D[x,y]))/c[x,y] + rx1[x,y] = 0.5*(-1.0*b[x,y] + sqrt(D[x,y]))/c[x,y] + } + if (D[x,y] < 0) { + rx_real[x,y] = -0.5*b[x,y]/c[x,y] + rx_imag[x,y] = 0.5*sqrt(-1.0*D[x,y])/c[x,y] + rx0[x,y] = rx_real[x,y] + rx1[x,y] = rx_real[x,y] + } + + # vertex of parabola xv[x,y] = -0.5*b[x,y]/c[x,y] yv[x,y] = -0.25*(b[x,y]*b[x,y])/c[x,y] + a[x,y] - } - # focus of parabola - if (c[x,y]) { + # focus of parabola xf[x,y] = -0.5*b[x,y]/c[x,y] yf[x,y] = (4.0*a[x,y]*c[x,y] - b[x,y]*b[x,y] + 1.0)/(4.0*c[x,y]) } @@ -116,16 +118,16 @@ NF > 0 { END { for (x=1; x<=nf_max; x++) { for (y=1; y<=nf_max; y++) { - if (x != y && r[x,y]) { - printf(OFMT OFS "(%s)" OFS " = (" OFMT ")(%s)^2" OFS " + (" OFMT ")(%s)" OFS " + (" OFMT ")", - (r[x,y]*r[x,y]), header[y], c[x,y], header[x], b[x,y], header[x], a[x,y]) - printf(" [" OFMT "," OFMT "][" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS" [" OFMT "," OFMT "]", + if (x != y) { + printf(OFMT OFS "(%s)" OFS "= (" OFMT ")(%s)^2 + (" OFMT ")(%s) + (" OFMT ")" OFS, + r2[x,y], header[y], c[x,y], header[x], b[x,y], header[x], a[x,y]) + printf("[" OFMT "," OFMT "][" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS "[" OFMT "," OFMT "]", rx0[x,y], (a[x,y] + b[x,y]*rx0[x,y] + c[x,y]*rx0[x,y]*rx0[x,y]), rx1[x,y], (a[x,y] + b[x,y]*rx1[x,y] + c[x,y]*rx1[x,y]*rx1[x,y]), 0, a[x,y], xv[x,y], yv[x,y]) - printf(" [" OFMT "," OFMT "]", xf[x,y], yf[x,y]) - printf(" [" OFMT "]" ORS, sqrt((yf[x,y] - yv[x,y])*(yf[x,y] - yv[x,y]))) + printf(OFS "[" OFMT "," OFMT "]", xf[x,y], yf[x,y]) + printf(OFS "[" OFMT "]" ORS, (yf[x,y] - yv[x,y])) } } } |
