summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author wukong <wukong@longaeva> 2018-06-17 16:38:38 -0700
committer wukong <wukong@longaeva> 2018-06-17 16:44:59 -0700
commit 2482727a6902e44e6a68236f878f5f9bf7947bd2 (patch)
tree 1849d67ab3f730d4ddecbfbe5486d618ff3ac5ec
parent 4916e9b13765de970deff094abb3eb50c663834a (diff)
added covariance matrix calculation
Diffstat (limited to '')
-rw-r--r-- cov.awk 61
-rw-r--r-- lin_reg.awk 34
-rw-r--r-- [-rwxr-xr-x] lin_reg1.awk 11
-rw-r--r-- lin_reg2.awk 12
-rw-r--r-- mean.awk 3
-rw-r--r-- mean_avg.awk 27
6 files changed, 102 insertions, 46 deletions
diff --git a/cov.awk b/cov.awk
new file mode 100644
index 0000000..176fd96
--- /dev/null
+++ b/cov.awk
@@ -0,0 +1,61 @@
+#!/usr/bin/awk -f
+
+### cov.awk
+# online covariance algorithm
+
+# Set the numeric output format and build the regular expression used to
+# decide whether a field is a number: an optional sign, then either digits
+# with an optional decimal part or a leading-point fraction, then an
+# optional exponent.
+BEGIN {
+    OFMT = "%.18g"
+    sign = "[+-]?"
+    decimal = "[0-9]+[.]?[0-9]*"
+    # at least one digit must follow the point, so a lone "." is not a number
+    fraction = "[.][0-9]+"
+    exponent = "([Ee]" sign "[0-9]+)?"
+    number = "^" sign "(" decimal "|" fraction ")" exponent "$"
+}
+
+# First record: give every column a name. A field that matches the number
+# pattern is treated as data, so the column gets a "col<N>" placeholder;
+# any other field is taken as the column's header text.
+NR == 1 {
+    for (y = 1; y <= NF; y++) {
+        if ($y ~ number)
+            header[y] = "col" y
+        else
+            header[y] = $y
+    }
+}
+
+# Every non-empty record: update the running statistics for each pair of
+# columns (x, y) using the online covariance update.
+#   count[x,y]           number of samples folded in for the pair
+#   meanx/meany[x,y]     running means of column x and column y
+#   C[x,y]               running co-moment, sum of dx * (y - updated mean of y)
+#   cov_pop/cov_samp     population and sample covariance derived from C
+# Fields beyond NF read as empty strings and therefore count as 0 here.
+NF > 0 {
+    # remember the widest record seen so far
+    if (nf_max < NF)
+        nf_max = NF
+
+    ### columns
+    for (y = 1; y <= nf_max; y++) {
+        # a field identical to its column name is header text, not data
+        if ($y != header[y]) {
+            ### rows
+            for (x = 1; x <= nf_max; x++) {
+                count[x,y] += 1
+                # deviation of x from its mean *before* this sample is added
+                dx[x,y] = $x - meanx[x,y]
+                meanx[x,y] = meanx[x,y] + dx[x,y] / count[x,y]
+                meany[x,y] = meany[x,y] + ($y - meany[x,y]) / count[x,y]
+                # uses the mean of y *after* the update above
+                C[x,y] = C[x,y] + dx[x,y] * ($y - meany[x,y])
+                cov_pop[x,y] = C[x,y] / count[x,y]
+                # sample covariance is undefined for a single sample
+                if (count[x,y] > 1)
+                    cov_samp[x,y] = C[x,y] / (count[x,y] - 1)
+                else
+                    cov_samp[x,y] = ""
+            }
+        }
+    }
+}
+
+# Print the sample covariance matrix: a header line ("cov" followed by the
+# column names), then one line per column y holding cov_samp[x,y] for each x.
+# Fields are separated by OFS and lines are terminated by ORS.
+END {
+    ### column headers
+    printf("cov")
+    for (y = 1; y <= nf_max; y++) {
+        # column names come from the input, so they are passed as arguments
+        # and never used as the printf format (a "%" in a name would be
+        # interpreted as a conversion)
+        printf("%s%s", OFS, header[y])
+    }
+    printf("%s", ORS)
+
+    ### columns
+    for (y = 1; y <= nf_max; y++) {
+        printf("%s%s", header[y], OFS)
+        ### rows
+        for (x = 1; x <= nf_max; x++) {
+            # an empty or unset cov_samp entry is printed as 0 by %g
+            printf("%.18g", cov_samp[x,y])
+            if (x < nf_max)
+                printf("%s", OFS)
+        }
+        printf("%s", ORS)
+    }
+}
diff --git a/lin_reg.awk b/lin_reg.awk
index 8622289..56114a2 100644
--- a/lin_reg.awk
+++ b/lin_reg.awk
@@ -4,7 +4,7 @@
# simple linear regression between columns
BEGIN {
- OFS = "%.18g"
+ OFMT = "%.18g"
sign = "[+-]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -13,13 +13,11 @@ BEGIN {
}
NR == 1 {
- header_nf = NF
- for (n=1; n<=NF; n++) {
- ($n !~ number) ? header[n] = $n : header[n] = "col" n
- }
+ for (n=1; n<=NF; n++)
+ ($n ~ number) ? header[n] = "col" n : header[n] = $n
}
-NF != 0 {
+NF > 0 {
if (NF > nf_max)
nf_max = NF
@@ -31,15 +29,14 @@ NF != 0 {
count[y] += 1
sum[y] += $y
sum2[y] += $y*$y
- mean[y] = sum[y]/count[y]
-
- ### difference from the mean
- delta[y] = $y - mean[y]
- sum_delta[y] += delta[y]
- sum_delta2[y] += delta[y]*delta[y]
+ delta0[y] = $y - mean[y]
+ mean[y] = mean[y] + delta0[y]/count[y]
+ delta1[y] = $y - mean[y]
+ sum_delta[y] += delta1[y]
+ sum_delta2[y] += delta0[y]*delta1[y]
### sample variance
- (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = 0
+ (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = ""
# x = row, y = col
for (x=1; x<=nf_max; x++) {
@@ -47,9 +44,12 @@ NF != 0 {
sum_xy[x,y] += $x*$y
sum_delta_xy[x,y] += delta[x]*delta[y]
+ # covariance
+ #(count[x,y] > 1) ? cov[x,y] = sum_delta_xy[x,y]/(count[x,y] - 1) : cov[x,y] = ""
+
# correlation
r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y])
- (r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 1
+ (r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 1
ab_den[x,y] = (count[x,y]*sum2[x] - sum[x]*sum[x])
if (ab_den[x,y]) {
@@ -84,9 +84,9 @@ END {
for (y=1; y<=nf_max; y++) {
for (x=1; x<=nf_max; x++) {
if (x != y && r[x,y]) {
- printf("\n %.18g \t (%s) \t = (%.18g +/- %.18g)(%s) \t + (%.18g +/- %.18g)",
- 10.0*log(r[x,y]*r[x,y])/log(10), header[y], b[x,y], b_err[x,y], header[x],
- a[x,y], a_err[x,y])
+ printf("%.9g "OFS" (%s) "OFS" = (%.9g +/- %.9g)(%s) "OFS" + (%.9g +/- %.9g)%s",
+ 10.0*log(r[x,y]*r[x,y])/log(10.0), header[y], b[x,y],
+ b_err[x,y], header[x], a[x,y], a_err[x,y], ORS)
}
}
}
diff --git a/lin_reg1.awk b/lin_reg1.awk
index cf3e69a..8b03e2f 100755..100644
--- a/lin_reg1.awk
+++ b/lin_reg1.awk
@@ -4,7 +4,7 @@
# simple linear regression between columns
BEGIN {
- OFS = "%.18g"
+ OFMT = "%.18g"
sign = "[+-]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -13,13 +13,11 @@ BEGIN {
}
NR == 1 {
- header_nf = NF
- for (n=1; n<=NF; n++) {
- ($n !~ number) ? header[n] = $n : header[n] = "col" n
- }
+ for (n=1; n<=NF; n++)
+ ($n ~ number) ? header[n] = "col" n : header[n] = $n
}
-NF != 0 {
+NF > 0 {
if (NF > nf_max)
nf_max = NF
@@ -44,6 +42,7 @@ NF != 0 {
sum_xy[x,y] += $x*$y
sum_delta_xy[x,y] += delta[x]*delta[y]
+ # correlation
r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y])
(r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 0
diff --git a/lin_reg2.awk b/lin_reg2.awk
index dea52b5..7c00fe8 100644
--- a/lin_reg2.awk
+++ b/lin_reg2.awk
@@ -4,7 +4,7 @@
# simple linear regression between columns
BEGIN {
- OFS = "%.18g"
+ OFMT = "%.18g"
sign = "[+-]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -13,13 +13,11 @@ BEGIN {
}
NR == 1 {
- header_nf = NF
- for (n=1; n<=NF; n++) {
- ($n !~ number) ? header[n] = $n : header[n] = "col" n
- }
+ for (n=1; n<=NF; n++)
+ ($n ~ number) ? header[n] = "col" n : header[n] = $n
}
-NF != 0 {
+NF > 0 {
if (NF > nf_max)
nf_max = NF
@@ -39,7 +37,7 @@ NF != 0 {
sum_delta2[y] += delta[y]*delta[y]
### sample variance
- (count[y] - 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = 0
+ (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = ""
# x = row, y = col
for (x=1; x<=nf_max; x++) {
diff --git a/mean.awk b/mean.awk
index 54b2e14..6f16d8d 100644
--- a/mean.awk
+++ b/mean.awk
@@ -4,7 +4,7 @@
# calculate mean average
BEGIN {
- OFS = "%.18g"
+ OFMT = "%.18g"
sign = "[+-±]?"
decimal = "[0-9]+[.]?[0-9]*"
fraction = "[.][0-9]*"
@@ -33,3 +33,4 @@ END {
print "mean", "std_dev", "std_err", "count"
print mean, sqrt(var), sqrt(var/count), count
}
+
diff --git a/mean_avg.awk b/mean_avg.awk
index 6f5a270..565dcc2 100644
--- a/mean_avg.awk
+++ b/mean_avg.awk
@@ -4,20 +4,17 @@
# average columns of numerical data
BEGIN {
- OFS = "%.18g"
- sign = "[+-±]?"
- decimal = "[0-9]+[.]?[0-9]*"
- fraction = "[.][0-9]*"
- exponent = "([Ee]" sign "[0-9]+)?"
- number = "^" sign "(" decimal "|" fraction ")" exponent "$"
+ OFMT = "%.18g"
+ #sign = "[+-]?"
+ #decimal = "[0-9]+[.]?[0-9]*"
+ #fraction = "[.][0-9]*"
+ #exponent = "([Ee]" "[+-]?" "[0-9]+)?"
+ number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]*)([Ee][+-]?[0-9]+)?$"
}
NR == 1 {
- header_nf = NF
- for (n=1; n<=NF; n++) {
- if ($n !~ number)
- header[n] = $n
- }
+ for (n=1; n<=NF; n++)
+ ($n ~ number) ? header[n] = "col" n : header[n] = $n
}
# Welford's 'online' algorithm for variance
@@ -28,10 +25,10 @@ NF > 0 {
if ($n ~ number) {
count[n] += 1
delta0[n] = $n - mean[n]
- mean[n] = mean[n] + delta0[n]/count[n]
+ mean[n] += delta0[n]/count[n]
delta1[n] = $n - mean[n]
- M2[n] = M2[n] + delta0[n]*delta1[n]
- (count[n] > 1) ? var[n] = M2[n]/(count[n] - 1) : var[n] = "0"
+ M2[n] += delta0[n]*delta1[n]
+ (count[n] > 1) ? var[n] = M2[n]/(count[n] - 1) : var[n] = ""
}
}
}
@@ -42,7 +39,7 @@ END {
if (header[n])
print header[n], mean[n], sqrt(var[n]), sqrt(var[n]/count[n]), count[n]
else
- print "col_" n, mean[n], sqrt(var[n]), sqrt(var[n]/count[n]), count[n]
+ print "col" n, mean[n], sqrt(var[n]), sqrt(var[n]/count[n]), count[n]
}
}