summaryrefslogtreecommitdiff
path: root/lin_reg.awk
blob: 5a9df56f341d4c9fc2617c858f7a7231a87b0eea (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/awk -f

### lin_reg.awk
# simple linear regression between columns

# Set the output separator used by the report in END and build the regular
# expression that decides whether a field is treated as a number.
BEGIN {
    OFS = ":"
    sign = "[+-]?"
    # digits with an optional fractional part: 12, 12., 12.5
    decimal = "[0-9]+[.]?[0-9]*"
    # leading-dot form needs at least one digit (.5); otherwise a lone "."
    # would match and be accumulated as the value 0
    fraction = "[.][0-9]+"
    exponent = "([Ee]" sign "[0-9]+)?"
    # anchored, so the whole field must be a number
    number = "^" sign "(" decimal "|" fraction ")" exponent "$"
}

# First record: give every column a label.  A field that looks like a number
# means there is no header for that column, so it is labelled "col<index>";
# any other field is used as the label itself.
NR == 1 {
    n = 1
    while (n <= NF) {
        if ($n ~ number)
            header[n] = "col" n
        else
            header[n] = $n
        n++
    }
}

# Every non-empty record: update the running per-column statistics, then the
# pairwise sums and the running fit of each numeric column y on every other
# numeric column x (trendline: y = A + Bx).
NF {
    if (NF > nf_max)
        nf_max = NF

    ### pass 1: per-column running statistics
    # Every column is brought up to date before any pair is touched, so the
    # pair pass never reads a column's sums from the previous record.
    # Header labels never match `number` (see the NR == 1 rule), so the regex
    # test alone keeps them out; no comparison against header[] is needed.
    for (y = 1; y <= NF; y++) {
        numeric[y] = ($y ~ number)
        if (!numeric[y])
            continue

        count[y] += 1
        sum[y] += $y
        sum2[y] += $y*$y

        # incremental mean: delta0 is measured against the old mean,
        # delta1 against the updated one
        delta0[y] = $y - mean[y]
        mean[y] += delta0[y]/count[y]
        delta1[y] = $y - mean[y]

        # running sum of squared deviations from the mean
        sum_delta2[y] += delta0[y]*delta1[y]
    }

    ### pass 2: pairwise sums and running fit
    for (y = 1; y <= NF; y++) {
        if (!numeric[y])
            continue

        for (x = 1; x <= NF; x++) {
            # a column is never regressed on itself; END skips x == y too
            if (x == y || !numeric[x])
                continue

            count[x,y] += 1
            sum_xy[x,y] += $x*$y
            sum_delta_xy[x,y] += delta0[x]*delta1[y]

            # correlation; set to 1 while either column has no spread
            r_den[x,y] = sqrt(sum_delta2[x]*sum_delta2[y])
            r[x,y] = r_den[x,y] ? sum_delta_xy[x,y]/r_den[x,y] : 1

            # least-squares intercept a and slope b; when x has no spread the
            # fit falls back to a = 0, b = 1
            # NOTE(review): sum[] and sum2[] are per-column while count[x,y]
            # is per-pair; they agree only when both columns are numeric on
            # the same records
            ab_den[x,y] = (count[x,y]*sum2[x] - sum[x]*sum[x])
            if (ab_den[x,y]) {
                a[x,y] = (sum[y]*sum2[x] - sum[x]*sum_xy[x,y])/ab_den[x,y]
                b[x,y] = (count[x,y]*sum_xy[x,y] - sum[x]*sum[y])/ab_den[x,y]
            }
            else {
                a[x,y] = 0
                b[x,y] = 1
            }

            # error estimate: squared residuals are accumulated against the
            # fit as it stands at this record, once more than two points exist
            if (count[x,y] > 2) {
                err[x,y] = $y - (a[x,y] + b[x,y]*$x)
                sum_err2[x,y] += err[x,y]*err[x,y]
            }
            b_err_den[x,y] = (count[x,y] - 2)*sum_delta2[x]
            if (b_err_den[x,y])
                b_err[x,y] = sqrt(sum_err2[x,y]/b_err_den[x,y])
            a_err_den[x,y] = count[x,y]*b_err_den[x,y]
            if (a_err_den[x,y])
                a_err[x,y] = sqrt(sum2[x]/count[x,y])*b_err[x,y]

            # weighted mean, from HP-20S manual, pg 60
            # xw[x,y] = sum_xy[x,y]/sum[y]
            # yw[x,y] = b[x,y]*xw[x,y] + a[x,y]
            # xw_dist[x,y] = (xw[x,y] - mean[x])
            # yw_dist[x,y] = b[x,y]*(xw[x,y] - mean[x])
        }
    }
}

# Report one line per ordered column pair (x, y) that has a non-zero
# correlation.  Fields are separated by OFS:
#   r^2, (y label), " = (b +/- b_err)(x label) + (a +/- a_err)",
#   "[0,a][x-intercept,0]", "[mean x, fitted y at mean x]"
END {
    for (y=1; y<=nf_max; y++) {
        for (x=1; x<=nf_max; x++) {
            # test membership first so that pairs which were never seen do
            # not get created in r[] just by being looked at
            if (x == y || !((x,y) in r) || !r[x,y])
                continue

            # a horizontal trendline (b == 0) never crosses the x axis;
            # dividing by b there would abort the script
            if (b[x,y])
                x_cross = sprintf(OFMT, (-1.0*a[x,y]/b[x,y]))
            else
                x_cross = "nan"

            printf(OFMT OFS "(%s)" OFS " = (" OFMT " +/- " OFMT ")(%s) + (" OFMT " +/- " OFMT ")" OFS,
                (r[x,y]*r[x,y]),
                header[y], b[x,y], b_err[x,y],
                header[x], a[x,y], a_err[x,y])
            printf("[" OFMT "," OFMT "][%s," OFMT "]" OFS "[" OFMT "," OFMT "]" ORS,
                0, a[x,y], x_cross, 0,
                mean[x], b[x,y]*(mean[x]) + a[x,y])
            # printf("[" OFMT "," OFMT "]" OFS, xw[x,y], yw[x,y])
            # printf("[" OFMT "]" ORS, sqrt(xw_dist[x,y]*xw_dist[x,y] + yw_dist[x,y]*yw_dist[x,y]))
        }
    }
}