summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--cov.awk12
-rw-r--r--diff.awk2
-rw-r--r--lin_reg.awk9
-rw-r--r--mean.awk2
-rw-r--r--mean_avg.awk2
-rw-r--r--quad_reg.awk74
6 files changed, 54 insertions, 47 deletions
diff --git a/cov.awk b/cov.awk
index 56734e4..c06a447 100644
--- a/cov.awk
+++ b/cov.awk
@@ -4,6 +4,7 @@
# online covariance algorithm
BEGIN {
+ OFS = FS
sign = "[+-]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -22,16 +23,17 @@ NF > 0 {
### columns
for (y=1; y<=nf_max; y++) {
- if ($y == header[y])
- continue
### rows
for (x=1; x<=nf_max; x++) {
+ if ($x !~ number || $x == header[x])
+ continue
count[x,y]++
dx[x,y] = $x - meanx[x,y]
+ dy[x,y] = $y - meany[x,y]
meanx[x,y] += dx[x,y]/count[x,y]
- meany[x,y] += ($y - meany[x,y])/count[x,y]
- C[x,y] += dx[x,y]*($y - meany[x,y])
- cov_pop[x,y] = C[x,y]/count[x,y]
+ meany[x,y] += dy[x,y]/count[x,y]
+ C[x,y] += dx[x,y]*dy[x,y]
+ # cov_pop[x,y] = C[x,y]/count[x,y]
(count[x,y] > 1) ? cov_samp[x,y] = C[x,y]/(count[x,y] - 1) : cov_samp[x,y] = ""
}
}
diff --git a/diff.awk b/diff.awk
index 721aac6..d39b654 100644
--- a/diff.awk
+++ b/diff.awk
@@ -55,7 +55,7 @@ NF {
continue
if ($y ~ number) {
data[y] = $y
- (data_prev[y] ~ number) ? diff[y] = data[y] - data_prev[y] : diff[y] = ""
+ (data_prev[y] ~ number) ? diff[y] = data[y] - data_prev[y] : diff[y] = "nan"
data_prev[y] = data[y]
printf(OFMT, diff[y])
}
diff --git a/lin_reg.awk b/lin_reg.awk
index aeb5dfc..52e55d3 100644
--- a/lin_reg.awk
+++ b/lin_reg.awk
@@ -4,6 +4,7 @@
# simple linear regression between columns
BEGIN {
+ OFS = ":"
sign = "[+-]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -91,15 +92,15 @@ END {
for (y=1; y<=nf_max; y++) {
for (x=1; x<=nf_max; x++) {
if (x != y && r[x,y]) {
- printf(OFMT OFS "(%s)" OFS " = (" OFMT " +/- " OFMT ")(%s)" OFS " + (" OFMT " +/- " OFMT ")",
+ printf(OFMT OFS "(%s)" OFS " = (" OFMT " +/- " OFMT ")(%s) + (" OFMT " +/- " OFMT ")" OFS,
(r[x,y]*r[x,y]),
header[y], b[x,y], b_err[x,y],
header[x], a[x,y], a_err[x,y])
- printf(" [" OFMT "," OFMT "][" OFMT "," OFMT "] [" OFMT "," OFMT "]",
+ printf("[" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS "[" OFMT "," OFMT "]" OFS,
0, a[x,y], (-1.0*a[x,y]/b[x,y]), 0,
mean[x], b[x,y]*(mean[x]) + a[x,y])
- printf(" [" OFMT "," OFMT "]", xw[x,y], yw[x,y])
- printf(" [" OFMT "]" ORS, sqrt(xw_dist[x,y]*xw_dist[x,y] + yw_dist[x,y]*yw_dist[x,y]))
+ printf("[" OFMT "," OFMT "]" OFS, xw[x,y], yw[x,y])
+ printf("[" OFMT "]" ORS, sqrt(xw_dist[x,y]*xw_dist[x,y] + yw_dist[x,y]*yw_dist[x,y]))
}
}
}
diff --git a/mean.awk b/mean.awk
index 3936902..754acfe 100644
--- a/mean.awk
+++ b/mean.awk
@@ -1,7 +1,7 @@
#!/usr/bin/awk -f
### mean.awk
-# calculate mean average of serialized numbers
+# calculate mean average of serialized input
BEGIN {
OFS = FS
diff --git a/mean_avg.awk b/mean_avg.awk
index dc39118..e4596b0 100644
--- a/mean_avg.awk
+++ b/mean_avg.awk
@@ -2,6 +2,8 @@
### mean_avg.awk
# average columns of numerical data
+# input: delimited data as text
+# output: list of univariate summary stats
BEGIN {
OFS = FS
diff --git a/quad_reg.awk b/quad_reg.awk
index a587a3a..edd0c6f 100644
--- a/quad_reg.awk
+++ b/quad_reg.awk
@@ -4,6 +4,8 @@
# quadratic regression along columns
BEGIN {
+ OFS = ":"
+ pi = 4.0*atan2(1.0, 1.0)
sign = "[+-]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -34,10 +36,10 @@ NF > 0 {
sum4[y] += $y*$y*$y*$y
delta0[y] = $y - mean[y]
mean[y] += delta0[y]/count[y]
+ mean2[y] = sum2[y]/count[y]
delta1[y] = $y - mean[y]
- delta[y] = delta1[y]
- delta2[y] = delta0[y]*delta[1]
- sum_delta[y] += delta[y]
+ delta2[y] = delta0[y]*delta1[y]
+ sum_delta[y] += delta1[y]
sum_delta2[y] += delta2[y]
### sample variance
@@ -45,21 +47,17 @@ NF > 0 {
# x = row, y = col, trendline: y = A + Bx + Cx^2
for (x=1; x<=nf_max; x++) {
- count[x,y] += 1
+ count_xy[x,y] += 1
sum_xy[x,y] += $x*$y
sum_x2y[x,y] += $x*$x*$y
- sum_delta_xy[x,y] += delta[x]*delta[y]
- sum_delta_xx2[x,y] += delta[x]*delta2[x]
- sum_delta_x2y[x,y] += delta2[x]*delta[y]
- sum_delta_x2x2[x,y] += delta2[x]*delta2[x]
# covariances
- if (count[x,y] > 1) {
- s_xx[x,y] = sum2_delta[x]/(count[x,y] - 1)
- s_xy[x,y] = sum_delta_xy[x,y]/(count[x,y] - 1)
- s_xx2[x,y] = sum_delta_xx2[x,y]/(count[x,y] - 1)
- s_x2x2[x,y] = sum_delta_x2x2[x,y]/(count[x,y] - 1)
- s_x2y[x,y] = sum_delta_x2y[x,y]/(count[x,y] - 1)
+ if (count_xy[x,y] > 0) {
+ s_xx[x,y] = sum2[x]/count_xy[x,y] - mean[x]*mean[x]
+ s_xy[x,y] = sum_xy[x,y]/count_xy[x,y] - mean[x]*mean[y]
+ s_xx2[x,y] = sum3[x]/count_xy[x,y] - mean[x]*mean2[x]
+ s_x2x2[x,y] = sum4[x]/count_xy[x,y] - mean2[x]*mean2[x]
+ s_x2y[x,y] = sum_x2y[x,y]/count_xy[x,y] - mean2[x]*mean[y]
}
else {
s_xx[x,y] = 0
@@ -70,39 +68,43 @@ NF > 0 {
}
bc_den[x,y] = (s_xx[x,y]*s_x2x2[x,y] - s_xx2[x,y]*s_xx2[x,y])
- if (bc_den[x,y]) {
+ if (bc_den[x,y] != 0) {
c[x,y] = (s_x2y[x,y]*s_xx[x,y] - s_xy[x,y]*s_xx2[x,y])/bc_den[x,y]
b[x,y] = (s_xy[x,y]*s_x2x2[x,y] - s_x2y[x,y]*s_xx2[x,y])/bc_den[x,y]
}
else {
c[x,y] = 0
- b[x,y] = 1
+ b[x,y] = 0
}
a[x,y] = mean[y] - b[x,y]*mean[x] - c[x,y]*mean2[x]
# error estimate
- err[x,y] = ($y - (a[x,y] + b[x,y]*$x + c[x,y]*$x*$x))
+ err[x,y] = $y - a[x,y] - b[x,y]*$x - c[x,y]*$x*$x
sum_err2[x,y] += err[x,y]*err[x,y]
# correlation
- sum_delta2[y] ? r[x,y] = sqrt(1 - sum_err2[x,y]/sum_delta2[y]) : r[x,y] = 0
-
- D[x,y] = b[x,y]*b[x,y] - 4.0*a[x,y]*c[x,y]
+ sum_delta2[y] ? r2[x,y] = 1 - sum_err2[x,y]/sum_delta2[y] : r2[x,y] = 0
+ #print a[x,y],b[x,y],c[x,y],err[x,y],sum_err2[x,y]
# quadratic roots (x-intercepts)
- if (c[x,y]) {
- rx0[x,y] = (-1.0*b[x,y] - sqrt(D[x,y])/(2.0*c[x,y]))
- rx1[x,y] = (-1.0*b[x,y] + sqrt(D[x,y])/(2.0*c[x,y]))
- }
-
- # vertex of parabola
- if (c[x,y]) {
+ if (c[x,y] != 0) {
+ D[x,y] = b[x,y]*b[x,y] - 4.0*a[x,y]*c[x,y]
+ if (D[x,y] >= 0) {
+ rx0[x,y] = 0.5*(-1.0*b[x,y] - sqrt(D[x,y]))/c[x,y]
+ rx1[x,y] = 0.5*(-1.0*b[x,y] + sqrt(D[x,y]))/c[x,y]
+ }
+ if (D[x,y] < 0) {
+ rx_real[x,y] = -0.5*b[x,y]/c[x,y]
+ rx_imag[x,y] = 0.5*sqrt(-1.0*D[x,y])/c[x,y]
+ rx0[x,y] = rx_real[x,y]
+ rx1[x,y] = rx_real[x,y]
+ }
+
+ # vertex of parabola
xv[x,y] = -0.5*b[x,y]/c[x,y]
yv[x,y] = -0.25*(b[x,y]*b[x,y])/c[x,y] + a[x,y]
- }
- # focus of parabola
- if (c[x,y]) {
+ # focus of parabola
xf[x,y] = -0.5*b[x,y]/c[x,y]
yf[x,y] = (4.0*a[x,y]*c[x,y] - b[x,y]*b[x,y] + 1.0)/(4.0*c[x,y])
}
@@ -116,16 +118,16 @@ NF > 0 {
END {
for (x=1; x<=nf_max; x++) {
for (y=1; y<=nf_max; y++) {
- if (x != y && r[x,y]) {
- printf(OFMT OFS "(%s)" OFS " = (" OFMT ")(%s)^2" OFS " + (" OFMT ")(%s)" OFS " + (" OFMT ")",
- (r[x,y]*r[x,y]), header[y], c[x,y], header[x], b[x,y], header[x], a[x,y])
- printf(" [" OFMT "," OFMT "][" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS" [" OFMT "," OFMT "]",
+ if (x != y) {
+ printf(OFMT OFS "(%s)" OFS "= (" OFMT ")(%s)^2 + (" OFMT ")(%s) + (" OFMT ")" OFS,
+ r2[x,y], header[y], c[x,y], header[x], b[x,y], header[x], a[x,y])
+ printf("[" OFMT "," OFMT "][" OFMT "," OFMT "][" OFMT "," OFMT "]" OFS "[" OFMT "," OFMT "]",
rx0[x,y], (a[x,y] + b[x,y]*rx0[x,y] + c[x,y]*rx0[x,y]*rx0[x,y]),
rx1[x,y], (a[x,y] + b[x,y]*rx1[x,y] + c[x,y]*rx1[x,y]*rx1[x,y]),
0, a[x,y],
xv[x,y], yv[x,y])
- printf(" [" OFMT "," OFMT "]", xf[x,y], yf[x,y])
- printf(" [" OFMT "]" ORS, sqrt((yf[x,y] - yv[x,y])*(yf[x,y] - yv[x,y])))
+ printf(OFS "[" OFMT "," OFMT "]", xf[x,y], yf[x,y])
+ printf(OFS "[" OFMT "]" ORS, (yf[x,y] - yv[x,y]))
}
}
}