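"""Merge freshly downloaded BioRxiv and MedRxiv updates into the aggregated
paper database, dropping duplicate titles in favour of the newest entry and
keeping each DataFrame row-aligned with its embedding matrix.

Expected layout (inferred from the paths used below):

    aggregated_data/             existing BioRxiv parquet data
    biorxiv_ubin_embaddings.npy  existing BioRxiv embeddings
    db_update/                   new BioRxiv parquet files
    embed_update/                matching BioRxiv .npy files (same stem)
    db_update_med/               new MedRxiv parquet files
    embed_update_med/            matching MedRxiv .npy files (same stem)
"""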
import pandas as pd
import numpy as np
from pathlib import Path

def combine_databases():
    # Define paths (all relative to the current working directory)
    aggregated_data_path = Path("aggregated_data")
    db_update_bio_path = Path("db_update")
    biorxiv_embeddings_path = Path("biorxiv_ubin_embaddings.npy")
    embed_update_bio_path = Path("embed_update")

    db_update_med_path = Path("db_update_med")
    embed_update_med_path = Path("embed_update_med")

    # Load existing database and embeddings for BioRxiv
    df_bio_existing = pd.read_parquet(aggregated_data_path)
    bio_embeddings_existing = np.load(biorxiv_embeddings_path, allow_pickle=True)
    print(f"Existing BioRxiv data shape: {df_bio_existing.shape}, Existing BioRxiv embeddings shape: {bio_embeddings_existing.shape}")

    # Determine the embedding size from existing embeddings
    embedding_size = bio_embeddings_existing.shape[1]

    # Prepare lists to collect new updates
    bio_dfs_list = []
    bio_embeddings_list = []

    # Helper function to process updates from a specified directory
    def process_updates(new_data_directory, updated_embeddings_directory, dfs_list, embeddings_list):
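        """Collect update DataFrames and their embeddings into the given lists.

        Each ``*.parquet`` file in ``new_data_directory`` is paired with the
        ``.npy`` file of the same stem in ``updated_embeddings_directory``;
        pairs with mismatched row counts or embedding widths are skipped.
        """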
        new_data_files = sorted(Path(new_data_directory).glob("*.parquet"))
        for data_file in new_data_files:
            corresponding_embedding_file = Path(updated_embeddings_directory) / (data_file.stem + ".npy")

            if corresponding_embedding_file.exists():
                df = pd.read_parquet(data_file)
                new_embeddings = np.load(corresponding_embedding_file, allow_pickle=True)

                # Check if the number of rows in the DataFrame matches the number of rows in the embeddings
                if df.shape[0] != new_embeddings.shape[0]:
                    print(f"Shape mismatch for {data_file.name}: DataFrame has {df.shape[0]} rows, embeddings have {new_embeddings.shape[0]} rows. Skipping.")
                    continue

                # Skip files whose embedding width differs from the existing matrix
                if new_embeddings.shape[1] != embedding_size:
                    print(f"Skipping {data_file.name} due to embedding size mismatch.")
                    continue

                dfs_list.append(df)
                embeddings_list.append(new_embeddings)
            else:
                print(f"No corresponding embedding file found for {data_file.name}")

    # Process BioRxiv updates (MedRxiv updates are handled further below)
    process_updates(db_update_bio_path, embed_update_bio_path, bio_dfs_list, bio_embeddings_list)

    # Concatenate all BioRxiv updates
    if bio_dfs_list:
        df_bio_updates = pd.concat(bio_dfs_list)
    else:
        df_bio_updates = pd.DataFrame()

    if bio_embeddings_list:
        bio_embeddings_updates = np.vstack(bio_embeddings_list)
    else:
        bio_embeddings_updates = np.array([])

    # Append new BioRxiv data to the existing database (duplicates removed below)
    df_bio_combined = pd.concat([df_bio_existing, df_bio_updates])

    # Keep only the last occurrence of each title so updated rows win;
    # the same boolean mask is reused below to keep the embeddings aligned
    bio_mask = ~df_bio_combined.duplicated(subset=["title"], keep="last")
    df_bio_combined = df_bio_combined[bio_mask]

    # Combine BioRxiv embeddings, ensuring alignment with the DataFrame
    bio_embeddings_combined = (
        np.vstack([bio_embeddings_existing, bio_embeddings_updates])
        if bio_embeddings_updates.size
        else bio_embeddings_existing
    )

    # Filter the embeddings based on the DataFrame unique entries
    bio_embeddings_combined = bio_embeddings_combined[bio_mask]

    assert df_bio_combined.shape[0] == bio_embeddings_combined.shape[0], "Shape mismatch between BioRxiv DataFrame and embeddings"

    print(f"Filtered BioRxiv DataFrame shape: {df_bio_combined.shape}")
    print(f"Filtered BioRxiv embeddings shape: {bio_embeddings_combined.shape}")

    # Save combined BioRxiv DataFrame and embeddings
    combined_biorxiv_data_path = aggregated_data_path / "combined_biorxiv_data.parquet"
    df_bio_combined.to_parquet(combined_biorxiv_data_path)
    print(f"Saved combined BioRxiv DataFrame to {combined_biorxiv_data_path}")

    # Overwrite the existing embeddings file in place
    np.save(biorxiv_embeddings_path, bio_embeddings_combined)
    print(f"Saved combined BioRxiv embeddings to {biorxiv_embeddings_path}")

    # Prepare lists to collect new MedRxiv updates
    med_dfs_list = []
    med_embeddings_list = []

    process_updates(db_update_med_path, embed_update_med_path, med_dfs_list, med_embeddings_list)

    # Concatenate all MedRxiv updates; unlike BioRxiv there is no existing
    # MedRxiv database to merge with, so the updates alone form the output.
    # Bail out early if there are none (an empty DataFrame has no "title"
    # column, so the deduplication below would raise a KeyError)
    if not med_dfs_list:
        print("No MedRxiv updates found; skipping MedRxiv merge.")
        return

    df_med_combined = pd.concat(med_dfs_list)
    med_embeddings_combined = np.vstack(med_embeddings_list)

    last_date_in_med_database = df_med_combined["date"].max()

    # Keep only the last occurrence of each title so updated rows win;
    # reuse the boolean mask to keep the embeddings aligned
    med_mask = ~df_med_combined.duplicated(subset=["title"], keep="last")
    df_med_combined = df_med_combined[med_mask]
    med_embeddings_combined = med_embeddings_combined[med_mask]

    assert df_med_combined.shape[0] == med_embeddings_combined.shape[0], "Shape mismatch between MedRxiv DataFrame and embeddings"

    print(f"Filtered MedRxiv DataFrame shape: {df_med_combined.shape}")
    print(f"Filtered MedRxiv embeddings shape: {med_embeddings_combined.shape}")

    # Save combined MedRxiv DataFrame and embeddings
    combined_medrxiv_data_path = db_update_med_path / f"database_{last_date_in_med_database}.parquet"
    df_med_combined.to_parquet(combined_medrxiv_data_path)
    print(f"Saved combined MedRxiv DataFrame to {combined_medrxiv_data_path}")

    combined_medrxiv_embeddings_path = embed_update_med_path / f"database_{last_date_in_med_database}.npy"
    np.save(combined_medrxiv_embeddings_path, med_embeddings_combined)
    print(f"Saved combined MedRxiv embeddings to {combined_medrxiv_embeddings_path}")

if __name__ == "__main__":
    combine_databases()