import sys

#   The key observation here is that IP.DF combined with DNS.AA uniquely distinguishes the different injector processes of the GFW:
#     * Injector 1: dns_aa = 1 + DF=0 + IPID(0,65535)
#     * Injector 2: dns_aa = 0 + DF=1 + IPID(0,65535)
#     * Injector 3: dns_aa = 0 + DF=0 + IPID(0)

# Inclusive time window (epoch seconds) that the dataset is frozen to.
TS_MIN = 1568692823.487619
TS_MAX = 1590094531.2941742

# Output files for injector 1, 2 and 3, in that order.
DEFAULT_OUT_PATHS = ("injector1.csv", "injector2.csv", "injector3.csv")


def classify_injector(dns_aa, df, ip_id):
    """Return the injector number (1, 2 or 3) a row belongs to, or None.

    All arguments are the raw string fields taken from one CSV row:
    ``dns_aa`` and ``df`` are compared against '0'/'1', and ``ip_id``
    against the literal '0x00000000'.
    """
    if dns_aa == '1' and df == '0':
        return 1
    if dns_aa == '0' and df == '1':
        return 2
    if dns_aa == '0' and df == '0' and ip_id == '0x00000000':
        return 3
    return None


def split_by_injector(in_path, out_paths=DEFAULT_OUT_PATHS):
    """Split a ';'-separated capture file into one CSV per injector.

    The first line of ``in_path`` is copied to every output as the header.
    Each following row is dropped if its timestamp lies outside
    [TS_MIN, TS_MAX]; otherwise it is appended unchanged to the output
    selected by ``classify_injector``. Rows matching no injector are
    reported on stdout and written nowhere.

    Args:
        in_path: path of the input file.
        out_paths: three output paths, for injector 1, 2 and 3.

    Raises:
        ValueError: if a row does not have exactly 13 fields or its
            timestamp is not a valid float.
    """
    path1, path2, path3 = out_paths
    # The input is opened first so that a bad input path does not truncate
    # existing output files; `with` guarantees every file is flushed/closed.
    with open(in_path) as f, \
            open(path1, 'w') as i1, \
            open(path2, 'w') as i2, \
            open(path3, 'w') as i3:
        outputs = {1: i1, 2: i2, 3: i3}

        # copy header
        header = f.readline()
        for out in (i1, i2, i3):
            out.write(header)

        # Iterating the file (rather than calling readline() at the bottom
        # of a while loop) means `continue` always advances to the next
        # row; previously an out-of-window row was re-read forever.
        for line in f:
            # Full unpack doubles as a check that the row has 13 columns.
            (ts, qname, rtype, answer, cname, dns_ttl, dns_flags, dns_aa,
             ip_ttl, ip_id, ip_flags, df, tos) = line.rstrip().split(";")

            # We freeze the dataset by time between TS_MIN and TS_MAX.
            # For very large inputs split_by_awk_new.sh may be faster.
            ts = float(ts)
            if ts < TS_MIN or ts > TS_MAX:
                continue

            injector = classify_injector(dns_aa, df, ip_id)
            if injector is None:
                print("error", line)
                print("dns_aa: {}".format(dns_aa))
                print("df: {}".format(df))
                print("IPID: {}".format(ip_id))
            else:
                outputs[injector].write(line)


def main():
    """Command-line entry point: split the file named by the first argument."""
    if len(sys.argv) < 2:
        sys.exit("usage: {} <input.csv>".format(sys.argv[0]))
    split_by_injector(sys.argv[1])


if __name__ == "__main__":
    main()
