luow-amd and haoyang-amd committed
Commit 1f903f1
1 Parent(s): 2dfa258

Update README.md (#11)


- Update README.md (8d430fb9887076b9cf34d9448b6cf6a41388978f)


Co-authored-by: haoyanli <haoyang-amd@users.noreply.huggingface.co>

Files changed (1)
  1. README.md +5 -3
README.md CHANGED
@@ -24,7 +24,8 @@ python3 quantize_quark.py \
  --quant_scheme w_fp8_a_fp8 \
  --kv_cache_dtype fp8 \
  --num_calib_data 128 \
- --model_export quark_safetensors
+ --model_export quark_safetensors \
+ --no_weight_matrix_merge
 # If model size is too large for single GPU, please use multi GPU instead.
 python3 quantize_quark.py \
  --model_dir $MODEL_DIR \
@@ -33,6 +34,7 @@ python3 quantize_quark.py \
  --kv_cache_dtype fp8 \
  --num_calib_data 128 \
  --model_export quark_safetensors \
+ --no_weight_matrix_merge \
  --multi_gpu
 ```
 ## Deployment
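Taken together, the two hunks above append `--no_weight_matrix_merge` to both `quark_safetensors` export commands. For convenience, here is a sketch of the two commands as they read after this commit, assembled only from the flags visible in the diff; the README's actual commands may carry additional options that fall outside the hunk context, and `$MODEL_DIR` is the placeholder the README already uses.

```bash
# Single-GPU quantization/export, per the first hunk (sketch: only the
# flags visible in the diff are included here).
python3 quantize_quark.py \
    --model_dir $MODEL_DIR \
    --quant_scheme w_fp8_a_fp8 \
    --kv_cache_dtype fp8 \
    --num_calib_data 128 \
    --model_export quark_safetensors \
    --no_weight_matrix_merge

# If the model is too large for a single GPU, add --multi_gpu, per the
# second hunk.
python3 quantize_quark.py \
    --model_dir $MODEL_DIR \
    --quant_scheme w_fp8_a_fp8 \
    --kv_cache_dtype fp8 \
    --num_calib_data 128 \
    --model_export quark_safetensors \
    --no_weight_matrix_merge \
    --multi_gpu
```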
 
@@ -53,9 +55,9 @@ The quantization evaluation results are conducted in pseudo-quantization mode, w
 <tr>
  <td>Perplexity-wikitext2
  </td>
- <td>5.3164
+ <td>3.7797
  </td>
- <td>5.4323
+ <td>3.8561
  </td>
 </tr>
 </table>
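The last hunk replaces the two numeric cells in the Perplexity-wikitext2 row (the table's column headers sit outside the diff context, so which model each cell belongs to is not visible here). For reference, the standard definition of token-level perplexity, which the commit itself does not spell out; lower is better:

```latex
% Standard token-level perplexity over an evaluation set of N tokens
% x_1, ..., x_N (the usual wikitext2 convention; this is background,
% not something defined by the commit).
\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta\bigl(x_i \mid x_{<i}\bigr)\right)
```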