added summations to quad_reg

added in-progress lpf.awk (low pass filter), an adaptation of the convolution script (conv.awk) to use delimited columns as input; additional tweaking of OFMT, OFS, and conditional print statements
author: wukong <wukong@longaeva> 2018-09-09 23:48:31 -0700
committer: wukong <wukong@longaeva> 2018-09-09 23:48:31 -0700
commit: d1f6c89be163d9399d569e01458242d8ce15e041 (patch)
tree: a263822af5cb1532f9eb1abe7b7ae6334c27d170
parent: e42cee748f5bc38d11742739b5e2cad4b6a07c43 (diff)
10 files changed, 194 insertions, 46 deletions
diff --git a/ABOUT.TXT b/ABOUT.TXT
index f471f31..b16ad44 100644
--- a/ABOUT.TXT
+++ b/ABOUT.TXT
@@ -1,3 +1,2 @@
-repo:	awk
-
-desc:	experiments in awk, etc.
+repo:	awk;
+desc:	experiments in awk, etc.;
diff --git a/diff.awk b/diff.awk
index c90a0e0..015669a 100644
--- a/diff.awk
+++ b/diff.awk
@@ -4,6 +4,7 @@
 # numerical diff along columns
 
 BEGIN {
+    OFS = FS
     sign = "[+-]?"
     decimal = "[0-9]+[.]?[0-9]*"
     fraction = "[.][0-9]*"
diff --git a/lpf.awk b/lpf.awk
new file mode 100644
index 0000000..814020f
--- /dev/null
+++ b/lpf.awk
@@ -0,0 +1,125 @@
+#!/usr/bin/awk -f
+
+### lpf.awk
+# Low Pass Filter with Hardcoded FIR Window
+
+BEGIN {
+    OFS = FS    # output separator mirrors the input separator
+    # ERE fragments for one whole numeric field, e.g. -1, 2.5, .5, 1e-3
+    sign = "[+-]?"
+    decimal = "[0-9]+[.]?[0-9]*"
+    fraction = "[.][0-9]*"
+    exponent = "([Ee]" sign "[0-9]+)?"
+    number = "^" sign "(" decimal "|" fraction ")" exponent "$"
+
+    # hardcoded FIR window taps, blank separated (alternatives for reference)
+    #H = "1.00   1.00    1.00"   # rect
+    #H = "0.25   0.50    0.25"   # von Hann
+    H = "0.23   0.54    0.23"   # Hamming
+    window_size = split(H, H_arr, "[ ]+")   # "+": separator must be non-empty
+}
+
+NR == 1 {    # first record: name every column; numeric cells get "col<N>"
+    for (col = 1; col <= NF; col++)
+        header[col] = ($col ~ number) ? ("col" col) : $col
+}
+
+NF > 0 {
+    # Per-record filter step.  For every column position up to the widest
+    # record seen so far:
+    #   - a cell equal to the column's header is echoed as "name OFS name_lpf"
+    #   - a numeric cell is pushed into that column's sliding buffer X_arr
+    #     and printed as "value OFS filtered"
+    #   - any other cell prints nothing but its separator
+    # Globals written: nf_max, input_size, count[], X_arr[], Y[].
+
+    if (NF > nf_max)
+        nf_max = NF
+
+    # the sample buffer is kept exactly as long as the FIR window
+    input_size = window_size
+
+    ### columns
+    for (y=1; y<=nf_max; y++) {
+        # field text goes through "%s", never into the format string itself,
+        # so a "%" in a header, a value or OFS cannot be read as a directive
+        if ($y == header[y])
+            printf("%s%s%s_lpf", header[y], OFS, header[y])
+
+        if ($y ~ number) {
+            count[y]++
+
+            # rotate input buffer: drop the oldest sample, append the new one
+            # (stop at input_size-1 so no slot past the buffer is ever read)
+            for (n=1; n<input_size; n++) {
+                X_arr[y,n] = X_arr[y,n+1]
+            }
+            X_arr[y,input_size] = $y
+
+            # Sum H_arr[n-m+1]*X_arr[y,m] over n, m in 1..window_size.  Taps
+            # with m > n would index H_arr below 1, where it is unset and
+            # evaluates to 0, so the inner loop stops at m = n.
+            # NOTE(review): this totals every partial convolution sum rather
+            # than taking a single output tap; kept as-is pending confirmation
+            # of the intended filter response.
+            Y[y] = 0
+            for (n=1; n<=window_size; n++) {
+                for (m=1; m<=n; m++) {
+                    Y[y] += H_arr[n-m+1]*X_arr[y,m]
+                }
+            }
+            printf("%s%s%s", X_arr[y,input_size], OFS, Y[y])
+        }
+
+        # separator between columns, record terminator after the last one
+        if (y < nf_max)
+            printf("%s", OFS)
+        else
+            printf("%s", ORS)
+    }
+}
+
+END {
+    # Flush the filter tail: after the last record, feed window_size zero
+    # samples through every column that received numeric data and print one
+    # extra row per step, in the same "value OFS filtered" layout as the
+    # per-record rows.  Columns that never held a number print nothing but
+    # their separator, as they do in the per-record rows.
+    # Reads nf_max, input_size, window_size, count[], H_arr[]; updates
+    # X_arr[] and Y[].
+    # NOTE(review): window_size flush rows are emitted; confirm whether
+    # window_size-1 was intended.
+
+    ### rows
+    for (x=1; x<=window_size; x++) {
+        ### columns
+        for (y=1; y<=nf_max; y++) {
+            if (count[y] > 0) {
+                # rotate input buffer and zero-pad the freed newest slot; the
+                # pad is stored under the same [y,n] key the filter reads
+                for (n=1; n<input_size; n++) {
+                    X_arr[y,n] = X_arr[y,n+1]
+                }
+                X_arr[y,input_size] = 0
+
+                # same sum as the per-record rows; m stops at n because
+                # H_arr is unset (0) for indices below 1
+                Y[y] = 0
+                for (n=1; n<=window_size; n++) {
+                    for (m=1; m<=n; m++) {
+                        Y[y] += H_arr[n-m+1]*X_arr[y,m]
+                    }
+                }
+
+                # values go through "%s" so a "%" in OFS is not a directive
+                printf("%s%s%s", X_arr[y,input_size], OFS, Y[y])
+            }
+
+            # separator between columns, record terminator after the last
+            if (y < nf_max)
+                printf("%s", OFS)
+            else
+                printf("%s", ORS)
+        }
+    }
+}
diff --git a/mean.awk b/mean.awk
index aa5ec8e..b509473 100644
--- a/mean.awk
+++ b/mean.awk
@@ -4,6 +4,7 @@
 # calculate mean average
 
 BEGIN {
+    OFS = FS
     sign = "[+-±]?"
     decimal = "[0-9]+[.]?[0-9]*"
     fraction = "[.][0-9]*"
diff --git a/mean_avg.awk b/mean_avg.awk
index aeee3db..8a5a3c5 100644
--- a/mean_avg.awk
+++ b/mean_avg.awk
@@ -4,6 +4,7 @@
 # average columns of numerical data
 
 BEGIN {
+    OFS = FS
     #sign = "[+-]?"
     #decimal = "[0-9]+[.]?[0-9]*"
     #fraction = "[.][0-9]*"
diff --git a/quad_reg.awk b/quad_reg.awk
index 8939947..1b30afd 100644
--- a/quad_reg.awk
+++ b/quad_reg.awk
@@ -1,9 +1,10 @@
 #!/usr/bin/awk -f
 
-### lin_reg2.awk
+### quad_reg.awk
 # simple linear regression between columns
 
 BEGIN {
+    OFMT="%.9g"
     sign = "[+-]?"
     decimal = "[0-9]+[.]?[0-9]*"
     fraction = "[.][0-9]*"
@@ -28,7 +29,10 @@ NF > 0 {
             count[y] += 1
             sum[y] += $y
             sum2[y] += $y*$y
+            sum3[y] += $y*$y*$y
+            sum4[y] += $y*$y*$y*$y
             mean[y] = sum[y]/count[y]
+            mean2[y] = sum2[y]/count[y]
 
             ### difference from the mean
             delta[y] = $y - mean[y]
@@ -38,38 +42,39 @@ NF > 0 {
             ### sample variance
             (count[y] > 1) ? var[y] = sum_delta2[y]/(count[y] - 1) : var[y] = ""
 
-            # x = row, y = col
+            # x = row, y = col, trendline: y = A + Bx + Cx^2
             for (x=1; x<=nf_max; x++) {
                 count[x,y] += 1
                 sum_xy[x,y] += $x*$y
+                sum_x2y[x,y] += $x*$x*$y
                 sum_delta_xy[x,y] += delta[x]*delta[y]
 
-                # correlation
-                r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y])
-                (r_den[x,y]) ? r[x,y] = sum_delta_xy[x,y]/r_den[x,y] : r[x,y] = 1
+                # covariances of x, x^2 and y, each as mean(u*v) - mean(u)*mean(v)
+                if (count[x,y] > 1) {
+                    s_xx[x,y] = sum2[x]/(count[x,y]) - mean[x]*mean[x]
+                    s_xy[x,y] = sum_xy[x,y]/(count[x,y]) - mean[x]*mean[y]
+                    s_xx2[x,y] = sum3[x]/(count[x,y]) - mean[x]*mean2[x]
+                    s_x2x2[x,y] = sum4[x]/(count[x,y]) - mean2[x]*mean2[x]
+                    s_x2y[x,y] = sum_x2y[x,y]/(count[x,y]) - mean2[x]*mean[y]  # sum_x2y is keyed by [x,y]
+                }
 
-                ab_den[x,y] = (count[x,y]*sum2[x] - sum[x]*sum[x])
-                if (ab_den[x,y]) {
-                    a[x,y] = (sum[y]*sum2[x] - sum[x]*sum_xy[x,y])/ab_den[x,y]
-                    b[x,y] = (count[x,y]*sum_xy[x,y] - sum[x]*sum[y])/ab_den[x,y]
+                bc_den[x,y] = (s_xx[x,y]*s_x2x2[x,y] - s_xx2[x,y]*s_xx2[x,y])
+                if (bc_den[x,y]) {
+                    c[x,y] = (s_x2y[x,y]*s_xx[x,y] - s_xy[x,y]*s_xx2[x,y])/bc_den[x,y]
+                    b[x,y] = (s_xy[x,y]*s_x2x2[x,y] - s_x2y[x,y]*s_xx2[x,y])/bc_den[x,y]
                 }
                 else {
-                    a[x,y] = 0
-                    b[x,y] = 1
+                    c[x,y] = 0
+                    b[x,y] = 0
                 }
+                a[x,y] = mean[y] - b[x,y]*mean[x] - c[x,y]*mean[x]*mean[x]
 
-                ### error estimate
-                err_den[x,y] = count[x,y]*(count[x,y] - 2)
-                if (count[x,y] > 2) {
-                    err[x,y] = $y - (a[x,y] + b[x,y]*$x)
-                    sum_err2[x,y] += err[x,y]*err[x,y]
-                }
-                b_err_den[x,y] = (count[x,y] - 2)*sum_delta2[x]
-                if (b_err_den[x,y])
-                    b_err[x,y] = sqrt(sum_err2[x,y]/b_err_den[x,y])
-                a_err_den[x,y] = count[x,y]*b_err_den[x,y]
-                if (a_err_den[x,y])
-                    a_err[x,y] = sqrt(sum2[x]/count[x,y])*b_err[x,y]
+                # error estimate
+                err[x,y] = ($y - (a[x,y] + b[x,y]*$x + c[x,y]*$x*$x))
+                sum_err2[x,y] += err[x,y]*err[x,y]
+
+                # correlation
+                sum_delta2[y] ? r2[x,y] = sum_err2[x,y]/sum_delta2[y] : r2[x,y] = 1
             }
         }
         else
@@ -78,12 +83,11 @@ NF > 0 {
 }
 
 END {
-    for (y=1; y<=nf_max; y++) {
-        for (x=1; x<=nf_max; x++) {
-            if (x != y && r[x,y]) {
-                printf(OFMT OFS "(%s)" OFS " = (" OFMT " +/- " OFMT ")(%s)" OFS " + (" OFMT " +/- " OFMT ")" ORS,
-                    10.0*log(r[x,y]*r[x,y])/log(10), header[y], b[x,y], b_err[x,y], header[x],
-                    a[x,y], a_err[x,y])
+    for (x=1; x<=nf_max; x++) {
+        for (y=1; y<=nf_max; y++) {
+            if (x != y && r2[x,y]) {
+                printf(OFMT OFS "(%s)" OFS " = (" OFMT ")(%s)^2" OFS " + (" OFMT ")(%s)" OFS " + (" OFMT ")" ORS,
+                    10.0*log(r2[x,y])/log(10), header[y], c[x,y], header[x], b[x,y], header[x], a[x,y])
             }
         }
     }
diff --git a/sum1.awk b/sum1.awk
index 61d96e0..8f1419b 100644
--- a/sum1.awk
+++ b/sum1.awk
@@ -5,11 +5,13 @@
 # output: sum of each column
 #   missing entries are treated as zeros
 
+BEGIN { OFS = FS }
+
 {
-    for (i=1; i<=NF; i++)
-        sum[i] += $i
     if (NF > nf_max)
         nf_max = NF
+    for (i=1; i<=NF; i++)
+        sum[i] += $i
 }
 
 END {
@@ -18,3 +20,4 @@ END {
         printf((i < nf_max) ? OFS : ORS)
     }
 }
+
diff --git a/sum2.awk b/sum2.awk
index 979d133..694c047 100644
--- a/sum2.awk
+++ b/sum2.awk
@@ -3,7 +3,9 @@
 ### sum2.awk, print column sums
 # check that each line has the same number of fields as line one
 
-NR==1 { nf_max = NF }
+BEGIN { OFS = FS }
+
+NR == 1 { nf_max = NF }
 
 {
     for (i=1; i<=NF; i++)
@@ -13,6 +15,6 @@ NR==1 { nf_max = NF }
 }
 
 END {
-    for (i=1; i<=NF; i++)
+    for (i=1; i<=nf_max; i++)
         printf(OFMT "%s", sum[i], i < nf_max ? OFS : ORS)
 }
diff --git a/sum3.awk b/sum3.awk
index bca92e3..3e4661f 100644
--- a/sum3.awk
+++ b/sum3.awk
@@ -11,7 +11,9 @@ function isnum(n) {
 }
 
 
-NR==1 {
+BEGIN { OFS = FS }
+
+NR == 1 {
     nfld = NF
     for (i=1; i<=NF; i++)
         numcol[i] = isnum($i)
diff --git a/sum4.awk b/sum4.awk
index 6a06455..aa3f044 100644
--- a/sum4.awk
+++ b/sum4.awk
@@ -4,6 +4,7 @@
 # input:    rows of integers and strings
 # output:   sums of numeric columns
 
+
 function isnum(n) {
     sign = "[+-]?"
     decimal = "[0-9]+[.]?[0-9]*"
@@ -13,24 +14,33 @@ function isnum(n) {
     return n ~ number
 }
 
-NR==1 {
+
+BEGIN { OFS = FS }
+
+NR == 1 {
     nf_max = NF
-    for (i=1; i<=NF; i++) {
-        (!isnum($i)) ? header[i] = $i : header[i] = "col" i
-    }
+    for (i=1; i<=NF; i++)
+        isnum($i) ? header[i] = "col" i : header[i] = $i
 }
 
 {
+    if (NF > nf_max)
+        nf_max = NF
     for (i=1; i<=NF; i++) {
-        sum[i] += $i
-        count[i]++
+        if ($i == header[i])
+            continue
+        if (isnum($i)) {
+            count[i]++
+            sum[i] += $i
+        }
     }
 }
 
 END {
     for (i=1; i<=nf_max; i++) {
-        if (header[i])
-            printf("%s:" OFS, header[i])
-        printf(OFMT ORS, sum[i])
+        printf("%s", (header[i]) ? header[i] OFS : OFS)    # text via "%s", never as the format
+        printf("%s", (count[i]) ? count[i] OFS sum[i] : OFS)
+        printf("%s", ORS)
     }
 }
+
author	wukong <wukong@longaeva>	2018-09-09 23:48:31 -0700
committer	wukong <wukong@longaeva>	2018-09-09 23:48:31 -0700
commit	d1f6c89be163d9399d569e01458242d8ce15e041 (patch)
tree	a263822af5cb1532f9eb1abe7b7ae6334c27d170
parent	e42cee748f5bc38d11742739b5e2cad4b6a07c43 (diff)