File size: 1,284 Bytes
23add19 d4d0c64 23add19 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
def load_file(filename):
    """Read a semicolon-separated text file into a header and data rows.

    The first line is stripped and split on ``;`` to form the header.
    Every following line that is non-empty after stripping whitespace is
    stripped and split on ``;`` to form one row; blank lines are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(header, rows)`` tuple, where ``header`` is a list of strings
        and ``rows`` is a list of lists of strings. For an empty file the
        header is ``[""]`` and ``rows`` is empty.
    """
    rows = []
    with open(filename, 'r') as handle:
        header = handle.readline().strip().split(";")
        for raw_line in handle:
            stripped = raw_line.strip()
            # Whitespace-only lines carry no record.
            if stripped:
                rows.append(stripped.split(";"))
    return header, rows
def remove_duplicates(data):
    """Drop rows whose identifying fields repeat an earlier row.

    Two rows count as duplicates when their first four fields and their
    last field are all equal; any fields in between are ignored for the
    comparison. The first row seen for each key is kept, and the relative
    order of the kept rows is unchanged.

    Args:
        data: Iterable of rows (indexable sequences with at least four
            fields).

    Returns:
        A new list containing the first occurrence of each distinct key.
    """
    first_seen = {}
    for row in data:
        identity = (row[0], row[1], row[2], row[3], row[-1])
        # setdefault only stores the row when the key is new, so later
        # duplicates never displace the first occurrence; dicts preserve
        # insertion order, so the output order matches the input order.
        first_seen.setdefault(identity, row)
    return list(first_seen.values())
def fix_arxiv_links(data):
    """Rewrite arXiv PDF links to abstract links in each row.

    In the second-to-last field of every row, each occurrence of
    ``arxiv.org/pdf`` is replaced by ``arxiv.org/abs``. All other fields
    are copied unchanged, and the input rows are not modified.

    Args:
        data: Iterable of rows with at least two fields each; the
            second-to-last field must be a string.

    Returns:
        A new list of new row lists with the link field rewritten.
    """
    fixed = []
    for item in data:
        row = list(item)  # copy so the caller's row is left untouched
        row[-2] = row[-2].replace("arxiv.org/pdf", "arxiv.org/abs")
        fixed.append(row)
    return fixed
def sort_data(data):
    """Return the rows ordered by their first four fields, then the last.

    The sort is stable, so rows with equal keys keep their input order.
    The input is not modified.

    Args:
        data: Iterable of rows (indexable sequences with at least four
            fields).

    Returns:
        A new list of the same row objects in sorted order.
    """
    def _ordering(row):
        return row[0], row[1], row[2], row[3], row[-1]

    ordered = list(data)
    ordered.sort(key=_ordering)
    return ordered
def main():
    """Normalise ``contamination_report.csv`` in place.

    Loads the report, sorts the rows, removes duplicates and rewrites
    arXiv PDF links, prints the resulting row count, then overwrites the
    same file with the header followed by the rows. A blank line is
    written before every run of rows sharing the same first two fields,
    including before the first run.
    """
    path = "contamination_report.csv"

    header, data = load_file(path)
    # Order matters: sort first, then de-duplicate, then rewrite links.
    data = fix_arxiv_links(remove_duplicates(sort_data(data)))
    print("Total datapoints:", len(data))

    with open(path, 'w') as out:
        out.write(";".join(header) + "\n")
        previous_group = None
        for row in data:
            group = (row[0], row[1])
            if group != previous_group:
                # Visually separate groups with an empty line.
                out.write("\n")
                previous_group = group
            out.write(";".join(row) + "\n")
# Run the clean-up only when executed as a script, not when imported.
if __name__ == "__main__":
    main()