# Passthrough self-merge of Undi95/PsyMedRP-v1-20B.
#
# Layers 0-9 and 52-61 are used once. Every layer in 10-51 is emitted twice:
#   * first copy  -> MODEL1_FILTER_ENV (q_proj/k_proj attenuated, down_proj scaled)
#   * second copy -> MODEL2_FILTER_ENV (only down_proj scaled)
# Resulting depth: 10 + 2 * 42 + 10 = 104 layers.
#
# The top-level keys in the next section exist only to carry YAML anchors; they
# are not expected to be read by the merge tool itself (assumption: unknown
# top-level keys are ignored by the consumer, as the original file relied on).
# Each one has a unique name because duplicate mapping keys are invalid YAML and
# are silently collapsed (last one wins) by lenient loaders.

# The amount to attenuate the Q and K matrices of the *FIRST COPY* of each layer.
# NOTE: This scales the score matrix values by QK_ATTENUATION_FACTOR^2
#       (eg: sqrt(1/2)^2 = 1/2; with the current 0.84: 0.84^2 ≈ 0.7056 ≈ sqrt(1/2)).
# 0.84 ≈ sqrt(sqrt(1/2)) ≈ 0.8409  <- This was changed, v4 was 0.7071067812 (sqrt(1/2)).
qk-attenuation-factor: &QK_ATTENUATION_FACTOR 0.84

# The amount to scale the contribution to the residual stream (to hopefully reduce overshoot).
# 0.71 ≈ sqrt(1/2)  <- This was changed, v4 was 0.7071067812.
residual-scale-factor: &RESIDUAL_SCALE_FACTOR 0.71

# Make the first copy *ONLY* take a more "bird's eye view" (ie: pay attention to more of the context).
# Merged into a source entry via `<<:`, which contributes the `parameters` key.
model1-filter-env: &MODEL1_FILTER_ENV
  parameters:
    scale:
      - filter: q_proj
        value: *QK_ATTENUATION_FACTOR
      - filter: k_proj
        value: *QK_ATTENUATION_FACTOR
      - filter: down_proj
        value: *RESIDUAL_SCALE_FACTOR
      # Entry without a filter: presumably the fallback for all other tensors.
      - value: 1.0

# Make the second copy pay attention to the context as before.
model2-filter-env: &MODEL2_FILTER_ENV
  parameters:
    scale:
      - filter: down_proj
        value: *RESIDUAL_SCALE_FACTOR
      # Entry without a filter: presumably the fallback for all other tensors.
      - value: 1.0

slices:
  # The first 10 layers are not duplicated.
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [0, 10]

  # Layers 10-51: each layer appears twice, first copy then second copy.
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [10, 11]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [10, 11]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [11, 12]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [11, 12]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [12, 13]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [12, 13]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [13, 14]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [13, 14]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [14, 15]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [14, 15]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [15, 16]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [15, 16]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [16, 17]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [16, 17]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [17, 18]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [17, 18]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [18, 19]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [18, 19]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [19, 20]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [19, 20]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [20, 21]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [20, 21]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [21, 22]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [21, 22]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [22, 23]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [22, 23]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [23, 24]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [23, 24]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [24, 25]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [24, 25]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [25, 26]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [25, 26]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [26, 27]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [26, 27]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [27, 28]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [27, 28]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [28, 29]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [28, 29]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [29, 30]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [29, 30]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [30, 31]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [30, 31]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [31, 32]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [31, 32]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [32, 33]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [32, 33]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [33, 34]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [33, 34]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [34, 35]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [34, 35]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [35, 36]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [35, 36]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [36, 37]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [36, 37]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [37, 38]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [37, 38]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [38, 39]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [38, 39]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [39, 40]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [39, 40]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [40, 41]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [40, 41]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [41, 42]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [41, 42]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [42, 43]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [42, 43]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [43, 44]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [43, 44]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [44, 45]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [44, 45]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [45, 46]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [45, 46]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [46, 47]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [46, 47]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [47, 48]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [47, 48]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [48, 49]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [48, 49]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [49, 50]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [49, 50]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [50, 51]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [50, 51]
        <<: *MODEL2_FILTER_ENV

  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [51, 52]
        <<: *MODEL1_FILTER_ENV
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [51, 52]
        <<: *MODEL2_FILTER_ENV

  # The last 10 layers are not duplicated.
  - sources:
      - model: Undi95/PsyMedRP-v1-20B
        layer_range: [52, 62]

merge_method: passthrough
dtype: float16