From 0c1d68ee8cc2a631d6029285e771ebbfe119995d Mon Sep 17 00:00:00 2001 From: wukong Date: Sat, 7 Dec 2019 15:02:59 -0800 Subject: code formatting changes to lin_reg scripts; added range to output of mean scripts; --- ABOUT.TXT | 2 +- diff.awk | 2 +- diff1.awk | 30 +++++++++++++++++------------- lin_reg1.awk | 1 - lin_reg2.awk | 1 - mean.awk | 17 +++++++++-------- mean_avg.awk | 9 +++++---- 7 files changed, 33 insertions(+), 29 deletions(-) diff --git a/ABOUT.TXT b/ABOUT.TXT index b16ad44..07ab7ec 100644 --- a/ABOUT.TXT +++ b/ABOUT.TXT @@ -1,2 +1,2 @@ repo: awk; -desc: experiments in awk, etc.; +desc: experiments in using awk for mathematical and statistical calculations; diff --git a/diff.awk b/diff.awk index ba452c7..bf8b472 100644 --- a/diff.awk +++ b/diff.awk @@ -1,7 +1,7 @@ #!/usr/bin/awk -f ### diff.awk -# numerical diff along columns +# print numerical diff along columns BEGIN { OFS = FS diff --git a/diff1.awk b/diff1.awk index cf1f2ff..7322e98 100644 --- a/diff1.awk +++ b/diff1.awk @@ -1,9 +1,11 @@ #!/usr/bin/awk -f ### diff1.awk -# numerical diff along columns +# print numerical diff along columns BEGIN { + OFS = FS + # OFMT = "%.9g" sign = "[+-]?" decimal = "[0-9]+[.]?[0-9]*" fraction = "[.][0-9]*" @@ -12,7 +14,7 @@ BEGIN { } NR == 1 { - ### orig data columns + # orig data columns for (n=1; n<=NF; n++) { if ($n !~ number) { header[n] = $n @@ -31,18 +33,20 @@ NF { if (NF > nf_max) nf_max = NF - ### data columns - for (y=1; y<=nf_max; y++) { - if ($y == header[y] || $y == dheader[y]) + # data columns + for (n=1; n<=nf_max; n++) { + if ($n == header[n] || $n == dheader[n]) continue - if ($y ~ number) { - data[y] = $y - (data_prev[y] ~ number) ? diff[y] = data[y] - data_prev[y] : diff[y] = "" - data_prev[y] = data[y] - printf(OFMT, diff[y]) + if ($n ~ number) { + data[n] = $n + if (data_prev[n] ~ number) + diff[n] = data[n] - data_prev[n] + else + diff[n] = "" + data_prev[n] = data[n] } - else - diff[y] = "" - printf(y < nf_max ? OFS : ORS) + printf(diff[n]) + printf(n < nf_max ? OFS : ORS) } } + diff --git a/lin_reg1.awk b/lin_reg1.awk index ae59c77..c657035 100644 --- a/lin_reg1.awk +++ b/lin_reg1.awk @@ -36,7 +36,6 @@ NF > 0 { sum_delta[y] += delta1[y] sum_delta2[y] += delta0[y]*delta1[y] - # x = row, y = col, trendline: y = A + Bx for (x=1; x<=nf_max; x++) { count[x,y] += 1 diff --git a/lin_reg2.awk b/lin_reg2.awk index f06d890..450f0f0 100644 --- a/lin_reg2.awk +++ b/lin_reg2.awk @@ -36,7 +36,6 @@ NF > 0 { sum_delta[y] += delta1[y] sum_delta2[y] += delta0[y]*delta1[y] - # x = row, y = col, trendline: y = A + Bx for (x=1; x<=nf_max; x++) { count[x,y] += 1 diff --git a/mean.awk b/mean.awk index 754acfe..069db4c 100644 --- a/mean.awk +++ b/mean.awk @@ -5,20 +5,21 @@ BEGIN { OFS = FS - sign = "[+-]?" - decimal = "[0-9]+[.]?[0-9]*" - fraction = "[.][0-9]+" - exponent = "([Ee]" sign "[0-9]+)?" - number = "^" sign "(" decimal "|" fraction ")" exponent "$" + #sign = "[+-]?" + #decimal = "[0-9]+[.]?[0-9]*" + #fraction = "[.][0-9]*" + #exponent = "([Ee]" "[+-]?" "[0-9]+)?" + number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]*)([Ee][+-]?[0-9]+)?$" } # Welford's 'online' algorithm for variance -NF > 0 { +NF { for (n=1; n<=NF; n++) { if ($n ~ number) { count += 1 (count == 1 || $n < min) ? min = $n : min = min (count == 1 || $n > max) ? max = $n : max = max + range = max - min delta0 = $n - mean mean += delta0/count delta1 = $n - mean @@ -29,7 +30,7 @@ NF > 0 { } END { - print "mean", "std_err", "std_dev", "min", "max", "total", "count" - print mean, sqrt(var/count), sqrt(var), min, max, (mean*count), count + print "mean", "std_err", "std_dev", "range", "min", "max", "total", "count" + print mean, sqrt(var/count), sqrt(var), range, min, max, (mean*count), count } diff --git a/mean_avg.awk b/mean_avg.awk index e4596b0..bef984d 100644 --- a/mean_avg.awk +++ b/mean_avg.awk @@ -20,7 +20,7 @@ NR == 1 { } # Welford's 'online' algorithm for variance -NF > 0 { +NF { if (NF > max_nf) max_nf = NF for (n=1; n <= NF; n++) { @@ -28,6 +28,7 @@ NF > 0 { count[n] += 1 (count[n] == 1 || $n < min[n]) ? min[n] = $n : min[n] = min[n] (count[n] == 1 || $n > max[n]) ? max[n] = $n : max[n] = max[n] + range[n] = max[n] - min[n] delta0[n] = $n - mean[n] mean[n] += delta0[n]/count[n] delta1[n] = $n - mean[n] @@ -38,12 +39,12 @@ NF > 0 { } END { - print "col", "mean", "std_err", "std_dev", "min", "max", "total", "count" + print "col", "mean", "std_err", "std_dev", "range", "min", "max", "total", "count" for (n=1; n<=max_nf; n++) { if (header[n]) - print header[n], mean[n], sqrt(var[n]/count[n]), sqrt(var[n]), min[n], max[n], mean[n]*count[n], count[n] + print header[n], mean[n], sqrt(var[n]/count[n]), sqrt(var[n]), range[n], min[n], max[n], mean[n]*count[n], count[n] else - print "col" n, mean[n], sqrt(var[n]/count[n]), sqrt(var[n]), min[n], max[n], mean[n]*count[n], count[n] + print "col" n, mean[n], sqrt(var[n]/count[n]), sqrt(var[n]), range[n], min[n], max[n], mean[n]*count[n], count[n] } } -- cgit v1.2.3