summaryrefslogtreecommitdiff
path: root/mean_avg.awk
blob: 23b1e738685b6289720a95f8b36bf6c1c0afc3fc (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/awk -f

### mean_avg.awk
# Report the mean, sample standard deviation, 1.96 * standard error of the
# mean, and count for each column of numerical data.

# Build `number`, the anchored regular expression used to decide whether a
# field is numeric data.
#   sign     - one optional '+', '-' or '±'
#   decimal  - digits, an optional '.', optional further digits
#   fraction - a leading '.' followed by at least one digit
#   exponent - optional 'e'/'E', optional sign, digits
BEGIN {
    # '-' must come first inside the brackets: written as "[+-±]" it denotes
    # the range '+'..'±', which lets digits and letters pass as a "sign".
    sign = "[-+±]?"
    decimal = "[0-9]+[.]?[0-9]*"
    # Require a digit after the point so that a lone "." is not a number.
    fraction = "[.][0-9]+"
    exponent = "([Ee]" sign "[0-9]+)?"
    number = "^" sign "(" decimal "|" fraction ")" exponent "$"
}

# First input record: remember its field count and keep every field that
# does not match `number` as the display name of its column.
NR == 1 {
    header_nf = NF
    idx = 0
    while (++idx <= NF) {
        if ($idx ~ number)
            continue
        header[idx] = $idx
    }
}

# Welford's 'online' algorithm for variance.
# For every non-empty record, update the running count, mean, sum of squared
# deviations (M2) and sample variance of each column whose field matches
# `number`. Fields that do not match are skipped, so each column keeps its
# own count. max_nf tracks the widest record seen.
NF != 0 {
    if (NF > max_nf)
        max_nf = NF
    for (n = 1; n <= NF; n++) {
        if ($n !~ number)
            continue

        # awk's string-to-number conversion stops at a leading "±" and
        # yields 0, so drop that prefix before doing arithmetic.
        value = $n
        sub(/^±/, "", value)
        value += 0

        count[n]++
        d_before = value - mean[n]          # deviation from the old mean
        mean[n] += d_before / count[n]
        d_after = value - mean[n]           # deviation from the new mean
        M2[n] += d_before * d_after

        # Sample variance divides by (count - 1); with one value report 0.
        var[n] = (count[n] > 1) ? M2[n] / (count[n] - 1) : 0
    }
}

# Print the summary table: a blank line, a header row, then one row per
# column index up to max_nf. A column is labelled with the name captured
# from the first record when there is one, otherwise with "col_<index>".
# Columns that never held a numeric field get a row of empty cells.
END {
    printf("\n")
    printf("%s, \t%s,  %s,  %s,  %s\n", "col", "mean", "std_dev", "std_err", "count")
    for (n = 1; n <= max_nf; n++) {
        # Test for presence rather than truthiness so a stored name is
        # always used, whatever its value.
        if (n in header)
            printf("%s, \t", header[n])
        else
            printf("col_%d, \t", n)

        if (count[n]) {
            # NOTE(review): the "std_err" column holds 1.96*sqrt(var/count),
            # which looks like a 95% confidence half-width rather than the
            # bare standard error - confirm the intended label.
            # %d keeps large counts exact; %g would print 1e+06 and beyond.
            printf("%g,  ±%g,  ±%g,  %d\n",
                mean[n], sqrt(var[n]), 1.96*sqrt(var[n]/count[n]), count[n])
        }
        else
            # Four empty cells after the label, matching the 5-column header.
            printf(",,,\n")
    }
}