#!/usr/bin/awk -f ### quad_reg.awk # quadratic regression along columns BEGIN { sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" exponent = "([Ee]" sign "[0-9]+)?" number = "^" sign "(" decimal "|" fraction ")" exponent "$" } NR == 1 { for (n=1; n<=NF; n++) ($n ~ number) ? header[n] = "col" n : header[n] = $n } NF > 0 { if (NF > nf_max) nf_max = NF ### iterate over columns for (y=1; y<=nf_max; y++) { if ($y == header[n]) continue if ($y ~ number) { ### mean count[y] += 1 sum[y] += $y sum2[y] += $y*$y sum3[y] += $y*$y*$y sum4[y] += $y*$y*$y*$y delta0[y] = $y - mean[y] mean[y] += delta0[y]/count[y] delta1[y] = $y - mean[y] delta[y] = delta1[y] delta2[y] = delta0[y]*delta[1] sum_delta[y] += delta[y] sum_delta2[y] += delta2[y] ### sample variance #(count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = "" # x = row, y = col, trendline: y = A + Bx + Cx^2 for (x=1; x<=nf_max; x++) { count[x,y] += 1 sum_xy[x,y] += $x*$y sum_x2y[x,y] += $x*$x*$y sum_delta_xy[x,y] += delta[x]*delta[y] sum_delta_xx2[x,y] += delta[x]*delta2[x] sum_delta_x2y[x,y] += delta2[x]*delta[y] sum_delta_x2x2[x,y] += delta2[x]*delta2[x] # covariances if (count[x,y] > 1) { s_xx[x,y] = sum2_delta[x]/(count[x,y] - 1) s_xy[x,y] = sum_delta_xy[x,y]/(count[x,y] - 1) s_xx2[x,y] = sum_delta_xx2[x,y]/(count[x,y] - 1) s_x2x2[x,y] = sum_delta_x2x2[x,y]/(count[x,y] - 1) s_x2y[x,y] = sum_delta_x2y[x,y]/(count[x,y] - 1) } else { s_xx[x,y] = 0 s_xy[x,y] = 0 s_xx2[x,y] = 0 s_x2x2[x,y] = 0 s_x2y[x,y] = 0 } bc_den[x,y] = (s_xx[x,y]*s_x2x2[x,y] - s_xx2[x,y]*s_xx2[x,y]) if (bc_den[x,y]) { c[x,y] = (s_x2y[x,y]*s_xx[x,y] - s_xy[x,y]*s_xx2[x,y])/bc_den[x,y] b[x,y] = (s_xy[x,y]*s_x2x2[x,y] - s_x2y[x,y]*s_xx2[x,y])/bc_den[x,y] } else { c[x,y] = 0 b[x,y] = 1 } a[x,y] = mean[y] - b[x,y]*mean[x] - c[x,y]*mean2[x] # error estimate err[x,y] = ($y - (a[x,y] + b[x,y]*$x + c[x,y]*$x*$x)) sum_err2[x,y] += err[x,y]*err[x,y] # correlation sum_delta2[y] ? r[x,y] = sqrt(1 - sum_err2[x,y]/sum_delta2[y]) : r[x,y] = 0 D[x,y] = b[x,y]*b[x,y] - 4.0*a[x,y]*c[x,y] # quadratic roots (x-intercepts) if (c[x,y]) { rx0[x,y] = (-1.0*b[x,y] - sqrt(D[x,y])/(2.0*c[x,y])) rx1[x,y] = (-1.0*b[x,y] + sqrt(D[x,y])/(2.0*c[x,y])) } # vertex of parabola if (c[x,y]) { xv[x,y] = -0.5*b[x,y]/c[x,y] yv[x,y] = -0.25*(b[x,y]*b[x,y])/c[x,y] + a[x,y] } # focus of parabola if (c[x,y]) { xf[x,y] = -0.5*b[x,y]/c[x,y] yf[x,y] = (4.0*a[x,y]*c[x,y] - b[x,y]*b[x,y] + 1.0)/(4.0*c[x,y]) } } } else continue } } END { for (x=1; x<=nf_max; x++) { for (y=1; y<=nf_max; y++) { if (x != y && r[x,y]) { printf(OFMT OFS "(%s)" OFS " = (" OFMT ")(%s)^2" OFS " + (" OFMT ")(%s)" OFS " + (" OFMT ")", (r[x,y]*r[x,y]), header[y], c[x,y], header[x], b[x,y], header[x], a[x,y]) printf(" [" OFMT "," OFMT "][" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS" [" OFMT "," OFMT "]", rx0[x,y], (a[x,y] + b[x,y]*rx0[x,y] + c[x,y]*rx0[x,y]*rx0[x,y]), rx1[x,y], (a[x,y] + b[x,y]*rx1[x,y] + c[x,y]*rx1[x,y]*rx1[x,y]), 0, a[x,y], xv[x,y], yv[x,y]) printf(" [" OFMT "," OFMT "]", xf[x,y], yf[x,y]) printf(" [" OFMT "]" ORS, sqrt((yf[x,y] - yv[x,y])*(yf[x,y] - yv[x,y]))) } } } }