anhbn commited on 1 day ago

Commit

53282aa

verified ·

1 Parent(s): 9dafcc6

Upload folder using huggingface_hub

Browse files

Files changed (27) hide show

.gitattributes +1 -0
README.md +525 -0
added_tokens.json +16 -0
chat_template.json +3 -0
config.json +48 -0
generation_config.json +14 -0
images/27vid-Gaza-City-Cover-gqmt-videoSixteenByNine1050 (1).jpg +0 -0
images/nguyentonnhuan.jpg +0 -0
images/nguyenvandung_back.png +0 -0
images/nguyenvandung_front.png +0 -0
images/test.txt +0 -0
images/trinhquangduy_back.jpg +0 -0
images/trinhquangduy_front.jpg +0 -0
merges.txt +0 -0
openvino_language_model.bin +3 -0
openvino_language_model.xml +0 -0
openvino_text_embeddings_model.bin +3 -0
openvino_text_embeddings_model.xml +107 -0
openvino_vision_embeddings_merger_model.bin +3 -0
openvino_vision_embeddings_merger_model.xml +0 -0
openvino_vision_embeddings_model.bin +3 -0
openvino_vision_embeddings_model.xml +167 -0
preprocessor_config.json +29 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +145 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,525 @@

+---
+license: apache-2.0
+language:
+- vi
+- en
+- zh
+base_model:
+- Qwen/Qwen2-VL-7B-Instruct
+library_name: transformers
+tags:
+- erax
+- multimodal
+- erax-vl-2B
+- insurance
+- ocr
+- vietnamese
+- bcg
+pipeline_tag: visual-question-answering
+widget:
+- src: images/photo-1-16505057982762025719470.webp
+  example_title: Test 1
+- src: images/vt-don-thuoc-f0-7417.jpeg
+  example_title: Test 2
+---
+<p align="left">
+  <img src="https://cdn-uploads.huggingface.co/production/uploads/63d8d8879dfcfa941d4d7cd9/GsQKdaTyn2FFx_cZvVHk3.png" alt="Logo">
+</p>
+# EraX-VL-7B-V1.5
+## Introduction 🎉
+Hot on the heels of the popular **<a href="https://huggingface.co/erax-ai/EraX-VL-7B-V1.0" target="_blank">EraX-VL-7B-V1.0 model</a>**, we proudly present **EraX-VL-7B-V1.5**, another robust multimodal model for **OCR (optical character recognition)** and **VQA (visual question-answering)** that excels in various languages 🌍, with a particular focus on Vietnamese 🇻🇳. This model stands out for its precise recognition capabilities across a range of documents 📝, including medical forms 🩺, invoices 🧾, bills of sale 💳, quotes 📄, and medical records 💊. This functionality is expected to be highly beneficial for hospitals 🏥, clinics 💉, insurance companies 🛡️, and other similar applications 📋. Built on the solid foundation of [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)[1], which we found to be of high quality and fluent in Vietnamese, `EraX-VL-7B-V1.5` has been fine-tuned to enhance its performance. We plan to continue improving and releasing new versions for free, along with sharing performance benchmarks in the near future.
+One standout feature of **EraX-VL-7B-V1.5** is its capability to do multi-turn Q&A with impressive reasoning!
+**NOTA BENE**:
+- EraX-VL-7B-V1.5 is NOT a typical OCR-only tool like Tesseract but a multimodal LLM-based model. To use it effectively, you may have to **tweak your prompt carefully** depending on your tasks.
+- This model was NOT fine-tuned on medical (X-ray) or car-accident datasets (yet). Stay tuned for an updated version coming sometime in early 2025.
+**EraX-VL-7B-V1.5** is a young member of our **EraX's LànhGPT** collection of LLM models.
+- **Developed by:**
+  - Nguyễn Anh Nguyên (nguyen@erax.ai)
+  - Nguyễn Hồ Nam (BCG)
+  - Phạm Huỳnh Nhật (nhat.ph@erax.ai)
+  - Phạm Đình Thục (thuc.pd@erax.ai)
+- **Funded by:** [Bamboo Capital Group](https://bamboocap.com.vn) and EraX
+- **Model type:** Multimodal Transformer with over 7B parameters
+- **Languages (NLP):** Primarily Vietnamese with multilingual capabilities
+- **License:** Apache 2.0
+- **Fine-tuned from:** [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)
+- **Prompt examples:** <a href="https://github.com/EraX-JS-Company/erax-vl-7b-v1/blob/main/prompts/Vietnam_popular_prompts.txt" target="_blank">Some popular prompt examples on Github.</a>
+## Benchmarks 📊
+## 🏆 LeaderBoard
+EraX-VL-7B-V1.5 achieves exceptionally high performance compared to other models of equal size or even **10 times larger, and it is open-source**! You can re-run the benchmark at any time.
+<table style="width:75%;">
+    <tr>
+        <th align="middle" width="300">Models</th>
+        <td align="middle" width="150"><b>Open-Source</b></td>
+        <td align="middle" width="300"><b>VI-MTVQA</b></td>
+    </tr>
+    <tr>
+        <th align="middle"><font color=darkred>EraX-VL-7B-V1.5 🥇 </font></th>
+        <td align="middle">✅</td>
+        <td align="middle">47.2 </td>
+    </tr>
+    <tr>
+        <th align="middle">Qwen2-VL 72B 🥈 </th>
+        <td align="middle">✘</td>
+        <td align="middle">41.6 </td>
+    </tr>
+    <tr>
+        <th align="middle">ViGPT-VL 🥉 </th>
+        <td align="middle">✘</td>
+        <td align="middle">39.1 </td>
+    </tr>
+    <tr>
+        <th align="middle"><font color=darkred>EraX-VL-2B-V1.5</font></th>
+        <td align="middle"> ✅ </td>
+        <td align="middle">38.2 </td>
+    </tr>
+    <tr>
+        <th align="middle"><font color=darkred>EraX-VL-7B-V1 </font></th>
+        <td align="middle"> ✅ </td>
+        <td align="middle">37.6 </td>
+    </tr>
+    <tr>
+        <th align="middle"><font color=darkred>Vintern-1B-V2</font></th>
+        <td align="middle"> ✅ </td>
+        <td align="middle">37.4 </td>
+    </tr>
+    <tr>
+        <th align="middle"><font color=darkred>Qwen2-VL 7B </font></th>
+        <td align="middle"> ✅ </td>
+        <td align="middle">30.0 </td>
+    </tr>
+    <tr>
+        <th align="middle">Claude3 Opus</th>
+        <td align="middle">✘</td>
+        <td align="middle">29.1 </td>
+    </tr>
+    <tr>
+        <th align="middle">GPT-4o mini </th>
+        <td align="middle"> ✘ </td>
+        <td align="middle">29.1 </td>
+    </tr>
+    <tr>
+        <th align="middle">GPT-4V</th>
+        <td align="middle">✘</td>
+        <td align="middle">28.9 </td>
+    </tr>
+    <tr>
+        <th align="middle">Gemini Ultra</th>
+        <td align="middle">✘</td>
+        <td align="middle">28.6 </td>
+    </tr>
+     <tr>
+        <th align="middle"><font color=darkred>InternVL2 76B</font></th>
+        <td align="middle"> ✅ </td>
+        <td align="middle">26.9 </td>
+    </tr>
+    <tr>
+        <th align="middle">QwenVL Max</th>
+        <td align="middle">✘</td>
+        <td align="middle">23.5 </td>
+    </tr>
+    <tr>
+        <th align="middle">Claude3 Sonnet</th>
+        <td align="middle">✘</td>
+        <td align="middle">20.8 </td>
+    </tr>
+    <tr>
+        <th align="middle">QwenVL Plus</th>
+        <td align="middle">✘</td>
+        <td align="middle">18.1 </td>
+    </tr>
+    <tr>
+        <th align="middle"><font color=darkred>MiniCPM-V2.5</font></th>
+        <td align="middle">✅</td>
+        <td align="middle">15.3 </td>
+    </tr>
+</table>
+**The test code for evaluating models in the paper can be found in**: <b><a href="https://github.com/EraX-JS-Company/EraX-MTVQA-Benchmark" target="_blank">EraX-JS-Company/EraX-MTVQA-Benchmark</a></b>
+## API trial 🎉
+Please contact **nguyen@erax.ai** for API access inquiries.
+## Examples 🧩
+### 1. OCR - Optical Character Recognition for Multi-Images
+**Example 01: Citizen identification card**
+<div style="display: flex; flex-direction: row; align-items: center; justify-content: center;">
+  <div style="text-align: center; margin: 0 10px;">
+    <img src="images/trinhquangduy_front.jpg" width="500" alt="Front View" />
+    <p>Front View</p>
+  </div>
+  <div style="text-align: center; margin: 0 10px;">
+    <img src="images/trinhquangduy_back.jpg" width="500" alt="Back View" />
+    <p>Back View</p>
+  </div>
+</div>
+<p style="text-align: center; font-size: 12px; color: gray; margin-top: 10px;">
+  Source: <a href="https://support.google.com/google-ads/thread/270967947/t%C3%B4i-%C4%91%C3%A3-g%E1%BB%ADi-h%C3%ACnh-%E1%BA%A3nh-c%C4%83n-c%C6%B0%E1%BB%9Bc-c%C3%B4ng-d%C3%A2n-c%E1%BB%A7a-ch%C3%ADnh-t%C3%B4i-%C4%91%E1%BB%83-x%C3%A1c-minh-danh-t%C3%ADnh?hl=vi" target="_blank">Google Support</a>
+</p>
+```
+{
+    "Số thẻ": "037094012351",
+    "Họ và tên": "TRỊNH QUANG DUY",
+    "Ngày sinh": "04/09/1994",
+    "Giới tính": "Nam",
+    "Quốc tịch": "Việt Nam",
+    "Quê quán": "Tân Thành, Kim Sơn, Ninh Bình",
+    "Nơi thường trú": "Xóm 6\nTân Thành, Kim Sơn, Ninh Bình",
+    "Có giá trị đến": "04/09/2034",
+    "Đặc điểm nhân dạng": "sẹo chấm c. 1cm trên đuôi mắt trái",
+    "Nơi cấp": "CỤC TRƯỞNG CỤC CẢNH SÁT\nQUẢN LÝ HÀNH CHÍNH VỀ TRẬT TỰ XÃ HỘI",
+    "Ngày cấp": "10/12/2022",
+    "Cán bộ ký tên": "Nguyễn Quốc Hùng",
+    "Mã định danh": "IDVNM0940123513037094012351"
+}
+```
+**Example 02: Driver's License**
+<div style="display: flex; flex-direction: row; align-items: center; justify-content: center;">
+  <div style="text-align: center; margin: 0 10px;">
+    <img src="images/nguyenvandung_front.png" width="500" alt="Front View" />
+    <p>Front View</p>
+  </div>
+  <div style="text-align: center; margin: 0 10px;">
+    <img src="images/nguyenvandung_back.png" width="500" alt="Back View" />
+    <p>Back View</p>
+  </div>
+</div>
+<p style="text-align: center; font-size: 12px; color: gray; margin-top: 10px;">
+  Source: <a href="https://baophapluat.vn/khoi-to-tai-xe-len-mang-mua-giay-phep-lai-xe-gia-de-chay-xe-post481047.html" target="_blank">Báo Pháp luật</a>
+</p>
+```
+{
+  "No.":"400116012313"
+  "Fullname":"NGUYỄN VĂN DŨNG"
+  "Date_of_birth":"08/06/1979"
+  "Nationality":"VIỆT NAM"
+  "Address":"X. Quỳnh Hầu, H. Quỳnh Lưu, T. Nghệ An
+  Nghệ An, ngày/date 23 tháng/month 04 năm/year 2022"
+  "Hang_Class":"FC"
+  "Expires":"23/04/2027"
+  "Place_of_issue":"Nghệ An"
+  "Date_of_issue":"ngày/date 23 tháng/month 04 năm/year 2022"
+  "Signer":"Trần Anh Tuấn"
+  "Các loại xe được phép":"Ô tô hạng C kéo rơmoóc, đầu kéo kéo sơmi rơmoóc và xe hạng B1, B2, C, FB2 (Motor vehicle of class C with a trailer, semi-trailer truck and vehicles of classes B1, B2, C, FB2)"
+  "Mã số":""
+}
+```
+**Example 03: Vehicle Registration Certificate**
+<div style="display: flex; flex-direction: row; align-items: center; justify-content: center;">
+  <div style="text-align: center; margin: 0 10px;">
+    <img src="images/nguyentonnhuan.jpg" width="700"/>
+  </div>
+</div>
+<p style="text-align: center; font-size: 12px; color: gray; margin-top: 10px;">
+  Source: <a href="https://vietnamnet.vn/phan-biet-cac-loai-giay-dang-ky-xe-khi-mua-moto-da-qua-su-dung-541341.html" target="_blank">Báo Vietnamnet</a>
+</p>
+```
+{
+    "Tên chủ xe": "NGUYỄN TÔN NHUẬN",
+    "Địa chỉ": "KE27 Kp3 P.TTTây Q7",
+    "Nhãn hiệu": "HONDA",
+    "Số loại": "DYLAN",
+    "Màu sơn": "Trắng",
+    "Năm sản xuất": "2012",
+    "Số máy": "F03E-0057735",
+    "Số khung": "SA04F-070410",
+    "Dung tích": "152",
+    "Số chỗ ngồi": "02",
+    "Biển số đăng ký": "59V1-498.89",
+    "Đăng ký lần đầu ngày": "08/06/2004",
+    "Chức vụ": "Thượng tá",
+    "Người ký": "Trần Văn Hiểu"
+}
+```
+**Example 04: Vehicle Registration**
+<div style="display: flex; flex-direction: row; align-items: center; justify-content: center;">
+  <div style="text-align: center; margin: 10 20px;">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/63d8d8879dfcfa941d4d7cd9/w5WCaQ-k9nupRIQYddcpr.jpeg" width="700"/>
+  </div>
+</div>
+<p style="text-align: center; font-size: 12px; color: gray; margin-top: 10px;">
+  Source: <a href="https://llumar.com.vn/dang-kiem-xe-o-to/" target="_blank">https://llumar.com.vn</a>
+</p>
+```
+{
+    "vehicle": {
+        "registration_number": "30A-072.36",
+        "vehicle_inspection_number": "2903V-093515",
+        "type": "ô tô con",
+        "mark": "MERCEDES-BENZ",
+        "model_code": "C300 W204",
+        "engine_number": "27294732096079",
+        "chassis_number": "RLMGF5EX3DV005333",
+        "manufactured_year_and_country": "2013, Việt Nam",
+        "life_time_limit_to": "",
+        "commercial_use": "",
+        "modification": ""
+    },
+    "specifications": {
+        "wheel_formula": "4x2",
+        "wheel_tread": "1521/1512 (mm)",
+        "overall_dimension": "4650 x 1770 x 1429 (mm)",
+        "largest_luggage_container_dimension": "",
+        "wheelbase": "2760 (mm)",
+        "kerb_mass": "1575 (kg)",
+        "design_authorized_pay_load": "",
+        "design_authorized_total_mass": "2090/2090 (kg)",
+        "design_authorized_towed_mass": "",
+        "permissible_number_of_pers_carried": "5 chỗ ngồi, 0 chỗ đứng, 0 chỗ nằm",
+        "type_of_fuel_used": "Xăng",
+        "engine_displacement": "2996 (cm3)",
+        "max_output_per_rpm": "170(kW)/6000vph",
+        "number": "KC-1292285"
+    },
+    "inspection_report_number": "2905V-20953/16",
+    "valid_until": "31/01/2018",
+    "place_date_of_issue": "Hà Nội, ngày 1 tháng 8 năm 2016",
+    "inspection_center": "ĐƠN VỊ KIỂM ĐỊNH XE CƠ GIỚI",
+    "signature": "Ngọc Tuấn",
+    "equipped_with_tachograph": "",
+    "inspection_stamp_was_not_issued": "",
+    "notes": "Biển đăng ký nền trắng"
+}
+```
+**Example 05: Receipt**
+<div style="display: flex; flex-direction: row; align-items: center; justify-content: center;">
+  <div style="text-align: center; margin: 10 20px;">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/63d8d8879dfcfa941d4d7cd9/40vIbNdM1cFXwQYNHx7Ag.jpeg" width="500"/>
+  </div>
+</div>
+<p style="text-align: center; font-size: 12px; color: gray; margin-top: 10px;">
+  Source: <a href="https://tintucketoan.com/cach-viet-hoa-don-hang-hoa-dich-vu-khong-chiu-thue-gtgt/" target="_blank">https://tintucketoan.com/</a>
+</p>
+```
+{
+    'Mẫu số': '01GKTKT3/001',
+    'Ký hiệu': 'TC/18P',
+    'Số': '0000030',
+    'Họ tên người mua hàng': None,
+    'Tên đơn vị': 'Công Ty TNHH Kế Toán Hà Nội',
+    'Mã số thuế': '0106235869',
+    'Địa chỉ': 'Số 49 Ngõ 322 Lê Trọng Tấn, phường Khương Mai, quận Thanh Xuân, Hà Nội',
+    'Hình thức thanh toán': 'TM',
+    'STT': None,
+    'Tên hàng hóa, dịch vụ': 'Tra cứu phần mềm thư viện pháp luật trực tuyến',
+    'Đơn vị tính': None,
+    'Số lượng': None,
+    'Đơn giá': '168.000',
+    'Thành tiền': '2.016.000',
+    'Thuế suất GTGT': None,
+    'Tiền thuế GTGT': None,
+    'Tổng cộng tiền thanh toán': '2.016.000',
+    'Số tiền viết bằng chữ': 'Hai triệu, không trăm mười sáu nghìn đồng',
+    'Người bán hàng': 'Bùi Văn Hùng',
+    'Chức vụ người bán hàng': 'TRƯỞNG CHI NHÁNH'
+}
+```
+### 2.1 Image Captioning
+<div align="center">
+  <img src="https://cdn-uploads.huggingface.co/production/uploads/63d8d8879dfcfa941d4d7cd9/g5V60A7rI94TH0z3zdSAA.jpeg" width="700"/>
+</div>
+Hình ảnh là biểu đồ BMI theo tuổi, thể hiện mối quan hệ giữa chỉ số khối cơ thể (BMI) và độ tuổi của trẻ em. Biểu đồ được chia thành các vùng màu khác nhau tương ứng với các mức BMI khác nhau:
+* **Vùng màu đỏ:**  Chỉ số BMI cao hơn 25, cho thấy tình trạng béo phì.
+* **Vùng màu vàng:** Chỉ số BMI nằm trong khoảng từ 18 đến 25, cho thấy nguy cơ béo phì.
+* **Vùng màu xanh lá cây nhạt:** Chỉ số BMI nằm trong khoảng từ 16 đến 18, cho thấy sức khỏe dinh dưỡng tốt.
+* **Vùng màu xanh lá cây đậm:** Chỉ số BMI thấp hơn 16, cho thấy tình trạng thiếu cân.
+Trục tung biểu diễn chỉ số BMI, trục hoành biểu diễn tuổi (tính bằng năm). Đường cong màu xám đậm thể hiện đường chuẩn BMI theo tuổi. Các đường cong này cho thấy sự thay đổi BMI theo thời gian, giúp đánh giá sự phát triển cân nặng của trẻ em. Ví dụ, ở trẻ em dưới 3 tuổi, BMI thường dao động trong vùng thiếu cân hoặc sức khỏe dinh dưỡng tốt. Khi trẻ lớn lên, BMI có xu hướng tăng dần, nhưng tốc độ tăng trưởng có thể khác nhau tùy thuộc vào từng cá nhân.  Biểu đồ cũng hiển thị các phần trăm phân vị (Percentile), cho biết tỷ lệ phần trăm trẻ em có BMI thấp hơn hoặc cao hơn so với một nhóm trẻ em cùng độ tuổi. Điều này giúp so sánh BMI của trẻ em với tiêu chuẩn quốc tế.
+### 2.2 Image Captioning
+<div align="center">
+  <img src="https://huggingface.co/erax-ai/EraX-VL-7B-V1.5/resolve/main/images/27vid-Gaza-City-Cover-gqmt-videoSixteenByNine1050%20(1).jpg" width="700"/>
+</div>
+Hình ảnh chụp một cảnh tượng đầy xúc động và bi thảm, dường như diễn ra ở một khu vực nghèo khó, có thể là một khu định cư hoặc khu ổ chuột. Trung tâm của bức ảnh là một chiếc xe đẩy được kéo bởi một con lừa. Trên xe đẩy có một nhóm người, bao gồm một người đàn ông lớn tuổi có vẻ như là người hướng dẫn, một phụ nữ mặc áo choàng đen, một phụ nữ trẻ mặc áo xám, một bé gái nhỏ được che mặt bằng khăn trùm đầu, và một cậu bé mặc áo xanh lá cây. Họ có vẻ như đang di chuyển từ một khu vực bị tàn phá sang một nơi khác. Xung quanh họ là những mảnh vụn, rác thải và những tòa nhà đổ nát, cho thấy một môi trường sống khó khăn và thiếu thốn. Phía sau họ, có một số người khác đang đi bộ hoặc đi xe đạp, một số người đang mang theo hành lý. Bầu trời trong xanh, nhưng không khí chung của bức ảnh là nặng nề và buồn bã. Có thể đây là một cảnh tượng di cư, tị nạn, hoặc một sự kiện thảm khốc nào đó.
+### 3. Multi-turns Visual Question Answering and Reasoning
+<div style="display: flex; flex-direction: row; align-items: center; justify-content: center;">
+  <div style="text-align: center; margin: 10 20px;">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/63d8d8879dfcfa941d4d7cd9/S2UosY_TOOV5XNn3d4_zL.jpeg" width="700"/>
+  </div>
+</div>
+<p style="text-align: center; font-size: 12px; color: gray; margin-top: 10px;">
+  Source: <a href="https://kenh14.vn/bien-cang-thai-trinh-nhap-vien-vi-ngo-doc-dang-dan-to-1-resort-5-sao-thieu-trach-nhiem-voi-khach-hang-20200924214133912.chn" target="_blank">https://kenh14.vn</a>
+</p>
+- **User**: Hãy cho biết thông tin bệnh nhân
+- **Assistant**: Tên: NGUYỄN THỊ THÁI TRINH, Ngày sinh: 09/09/1993, Giới tính: Nữ, Địa chỉ: TÂN ĐỊNH - Q1 - TPHCM, Mã thẻ BHYT: 19169080, Số khám bệnh: 2569501, Mã khu vực (K1/K2/K3): 2
+- **User**: Họ được chuẩn đoán bị bệnh gì?
+- **Assistant**: Nhiễm trùng đường ruột do vi khuẩn, không xác định loài
+- **User**: Người bệnh có tiền sử bệnh không? Nếu có thì là gì?
+- **Assistant**: Thiếu Calci do chế độ ăn
+- **User**: Bệnh nhân đã thực hiện những xét nghiệm gì?
+- **Assistant**: [NQ286] Tổng phân tích tế bào máu ngoại vi (bằng máy đếm laser), [NQ286] Điện giải đồ (Na, K, Cl) [Máu], [NQ286] Đo hoạt độ ALT (GPT) [Máu], [NQ286] Đo hoạt độ AST (GOT) [Máu], [NQ286] Định lượng Calci toàn phần [Máu], [NQ286] Định lượng Creatinin [Máu], [NQ286] Định lượng Glucose [Máu], [NQ286] Định lượng Urê máu [Máu], [NQ286] Siêu âm ổ bụng (gan mật, tụy, lách, thận, bàng quang)
+- **User**: Số tiền Vật tư y tế mà người bệnh phải trả là bao nhiêu?
+- **Assistant**: 19,708
+- **User**: Các loại thuốc dịch truyền người bệnh phải trả là gì, tổng số tiền là bao nhiêu?
+- **Assistant**: Atropin Sulfat (Syv2019), Bicefzidim 1g (Syv2019), GONCAL (Syv2019), Lactated Ringer's-500ml (Syv2019), Nước cất pha tiêm 5ml (Syv2019), Sodium Chloride 0.9% -500ml (Syv2019), Vincomid (Syv2019), Vinopa (Syv2019), tổng cộng 45,234 đồng
+## Quickstart 🎮
+Install the necessary packages:
+```bash
+python -m pip install git+https://github.com/huggingface/transformers accelerate
+python -m pip install qwen-vl-utils
+pip install flash-attn --no-build-isolation
+```
+Then you can use `EraX-VL-7B-V1.5` like this:
+```python
+import os
+import base64
+import json
+import cv2
+import numpy as np
+import matplotlib.pyplot as  plt
+import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+model_path = "erax/EraX-VL-7B-V1.5"
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="eager", # replace with "flash_attention_2" if your GPU is Ampere architecture
+    device_map="auto"
+)
+tokenizer =  AutoTokenizer.from_pretrained(model_path)
+# processor = AutoProcessor.from_pretrained(model_path)
+min_pixels = 256 * 28 * 28
+max_pixels = 1280 * 28 * 28
+processor = AutoProcessor.from_pretrained(
+     model_path,
+     min_pixels=min_pixels,
+     max_pixels=max_pixels,
+ )
+image_path ="image.jpg"
+with open(image_path, "rb") as f:
+    encoded_image = base64.b64encode(f.read())
+decoded_image_text = encoded_image.decode('utf-8')
+base64_data = f"data:image;base64,{decoded_image_text}"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": base64_data,
+            },
+            {
+                "type": "text",
+                "text": "Trích xuất thông tin nội dung từ hình ảnh được cung cấp."
+            },
+        ],
+    }
+]
+# Prepare prompt
+tokenized_text = processor.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[ tokenized_text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pt",
+)
+inputs = inputs.to("cuda")
+# Generation configs
+generation_config =  model.generation_config
+generation_config.do_sample   = True
+generation_config.temperature = 1.0
+generation_config.top_k       = 1
+generation_config.top_p       = 0.9
+generation_config.min_p       = 0.1
+generation_config.best_of     = 5
+generation_config.max_new_tokens     = 2048
+generation_config.repetition_penalty = 1.06
+# Inference
+generated_ids = model.generate(**inputs, generation_config=generation_config)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text[0])
+```
+## References 📑
+[1] Qwen team. Qwen2-VL. 2024.
+[2] Bai, Jinze, et al. "Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond." arXiv preprint arXiv:2308.12966 (2023).
+[4] Yang, An, et al. "Qwen2 technical report." arXiv preprint arXiv:2407.10671 (2024).
+[5] Chen, Zhe, et al. "Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2024.
+[6] Chen, Zhe, et al. "How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites." arXiv preprint arXiv:2404.16821 (2024).
+[7] Tran, Chi, and Huong Le Thanh. "LaVy: Vietnamese Multimodal Large Language Model." arXiv preprint arXiv:2404.07922 (2024).
+## Contact 🤝
+- For correspondence regarding this work or inquiries about an API trial, please contact Nguyễn Anh Nguyên at [nguyen@erax.ai](mailto:nguyen@erax.ai).
+- Follow us on <b><a href="https://github.com/EraX-JS-Company" target="_blank">EraX Github</a></b>

added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "EraX-VL-7B-V1.5",
+  "architectures": [
+    "Qwen2VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2_vl",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "in_chans": 3,
+    "model_type": "qwen2_vl",
+    "spatial_patch_size": 14
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 152064
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "attn_implementation": "flash_attention_2",
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.01,
+  "top_k": 1,
+  "top_p": 0.001,
+  "transformers_version": "4.47.0.dev0"
+}

images/27vid-Gaza-City-Cover-gqmt-videoSixteenByNine1050 (1).jpg ADDED Viewed

images/nguyentonnhuan.jpg ADDED Viewed

images/nguyenvandung_back.png ADDED Viewed

images/nguyenvandung_front.png ADDED Viewed

images/test.txt ADDED Viewed

File without changes

images/trinhquangduy_back.jpg ADDED Viewed

images/trinhquangduy_front.jpg ADDED Viewed

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_language_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcd30330595dccf86956386c695929ea329c110269d94b369c20ecae2f0fbe50
+size 3936211968

openvino_language_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_text_embeddings_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ddd207deedbbafaa2e15c910325fbbb1658491246ec356b9dd33593457e9b2e
+size 1089994756

openvino_text_embeddings_model.xml ADDED Viewed

	@@ -0,0 +1,107 @@

+<?xml version="1.0"?>
+<net name="Model0" version="11">
+	<layers>
+		<layer id="0" name="input" type="Parameter" version="opset1">
+			<data shape="?,?" element_type="i64" />
+			<output>
+				<port id="0" precision="I64" names="input">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="self.weight" type="Const" version="opset1">
+			<data element_type="bf16" shape="152064, 3584" offset="0" size="1089994752" />
+			<output>
+				<port id="0" precision="BF16" names="self.weight">
+					<dim>152064</dim>
+					<dim>3584</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="ov_ext::embedding/Convert" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="BF16">
+					<dim>152064</dim>
+					<dim>3584</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>152064</dim>
+					<dim>3584</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="ov_ext::embedding/Convert_1" type="Convert" version="opset1">
+			<data destination_type="i32" />
+			<input>
+				<port id="0" precision="I64">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="I32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="ov_ext::embedding/Constant" type="Const" version="opset1">
+			<data element_type="i32" shape="" offset="1089994752" size="4" />
+			<output>
+				<port id="0" precision="I32" />
+			</output>
+		</layer>
+		<layer id="5" name="ov_ext::embedding/Gather" type="Gather" version="opset8">
+			<data batch_dims="0" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>152064</dim>
+					<dim>3584</dim>
+				</port>
+				<port id="1" precision="I32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="2" precision="I32" />
+			</input>
+			<output>
+				<port id="3" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3584</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="Result_9" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+					<dim>3584</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="2" from-port="1" to-layer="5" to-port="0" />
+		<edge from-layer="3" from-port="1" to-layer="5" to-port="1" />
+		<edge from-layer="4" from-port="0" to-layer="5" to-port="2" />
+		<edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2024.6.0-17404-4c0f47d2335-releases/2024/6" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

openvino_vision_embeddings_merger_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:708721814872d85ce7faf6e2ac54b3d0225c029a5293704a84602e352c4943e3
+size 359928592

openvino_vision_embeddings_merger_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino_vision_embeddings_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85a63e6b6f0f4755f31e9c05cd97299215cb157882242dc3683781cd9f3349d4
+size 3010616

openvino_vision_embeddings_model.xml ADDED Viewed

	@@ -0,0 +1,167 @@

+<?xml version="1.0"?>
+<net name="Model9" version="11">
+	<layers>
+		<layer id="0" name="hidden_states" type="Parameter" version="opset1">
+			<data shape="?,?" element_type="f32" />
+			<output>
+				<port id="0" precision="FP32" names="hidden_states">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="1" name="Constant_131546" type="Const" version="opset1">
+			<data element_type="i64" shape="5" offset="0" size="40" />
+			<rt_info>
+				<attribute name="precise" version="0" />
+			</rt_info>
+			<output>
+				<port id="0" precision="I64" names="8">
+					<dim>5</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="2" name="aten::view/Reshape" type="Reshape" version="opset1">
+			<data special_zero="false" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>-1</dim>
+				</port>
+				<port id="1" precision="I64">
+					<dim>5</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="14,9,hidden_states">
+					<dim>-1</dim>
+					<dim>3</dim>
+					<dim>2</dim>
+					<dim>14</dim>
+					<dim>14</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="3" name="self.proj.weight_compressed" type="Const" version="opset1">
+			<data element_type="f16" shape="1280, 3, 2, 14, 14" offset="40" size="3010560" />
+			<output>
+				<port id="0" precision="FP16" names="self.proj.weight">
+					<dim>1280</dim>
+					<dim>3</dim>
+					<dim>2</dim>
+					<dim>14</dim>
+					<dim>14</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="4" name="self.proj.weight" type="Convert" version="opset1">
+			<data destination_type="f32" />
+			<rt_info>
+				<attribute name="decompression" version="0" />
+			</rt_info>
+			<input>
+				<port id="0" precision="FP16">
+					<dim>1280</dim>
+					<dim>3</dim>
+					<dim>2</dim>
+					<dim>14</dim>
+					<dim>14</dim>
+				</port>
+			</input>
+			<output>
+				<port id="1" precision="FP32">
+					<dim>1280</dim>
+					<dim>3</dim>
+					<dim>2</dim>
+					<dim>14</dim>
+					<dim>14</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="5" name="__module.proj/aten::_convolution/Convolution" type="Convolution" version="opset1">
+			<data strides="2, 14, 14" dilations="1, 1, 1" pads_begin="0, 0, 0" pads_end="0, 0, 0" auto_pad="explicit" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>3</dim>
+					<dim>2</dim>
+					<dim>14</dim>
+					<dim>14</dim>
+				</port>
+				<port id="1" precision="FP32">
+					<dim>1280</dim>
+					<dim>3</dim>
+					<dim>2</dim>
+					<dim>14</dim>
+					<dim>14</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32" names="32">
+					<dim>-1</dim>
+					<dim>1280</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="6" name="Constant_131593" type="Const" version="opset1">
+			<data element_type="i64" shape="2" offset="3010600" size="16" />
+			<rt_info>
+				<attribute name="precise" version="0" />
+			</rt_info>
+			<output>
+				<port id="0" precision="I64" names="18">
+					<dim>2</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="7" name="aten::view/Reshape_1" type="Reshape" version="opset1">
+			<data special_zero="false" />
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1280</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+					<dim>1</dim>
+				</port>
+				<port id="1" precision="I64">
+					<dim>2</dim>
+				</port>
+			</input>
+			<output>
+				<port id="2" precision="FP32">
+					<dim>-1</dim>
+					<dim>1280</dim>
+				</port>
+			</output>
+		</layer>
+		<layer id="8" name="Result_131595" type="Result" version="opset1">
+			<input>
+				<port id="0" precision="FP32">
+					<dim>-1</dim>
+					<dim>1280</dim>
+				</port>
+			</input>
+		</layer>
+	</layers>
+	<edges>
+		<edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+		<edge from-layer="1" from-port="0" to-layer="2" to-port="1" />
+		<edge from-layer="2" from-port="2" to-layer="5" to-port="0" />
+		<edge from-layer="3" from-port="0" to-layer="4" to-port="0" />
+		<edge from-layer="4" from-port="1" to-layer="5" to-port="1" />
+		<edge from-layer="5" from-port="2" to-layer="7" to-port="0" />
+		<edge from-layer="6" from-port="0" to-layer="7" to-port="1" />
+		<edge from-layer="7" from-port="2" to-layer="8" to-port="0" />
+	</edges>
+	<rt_info>
+		<Runtime_version value="2024.6.0-17404-4c0f47d2335-releases/2024/6" />
+		<conversion_parameters>
+			<framework value="pytorch" />
+			<is_python_object value="True" />
+		</conversion_parameters>
+	</rt_info>
+</net>

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 1003520,
+  "merge_size": 2,
+  "min_pixels": 200704,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 1003520,
+    "min_pixels": 200704
+  },
+  "temporal_patch_size": 2
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
+size 11420371

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,145 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "processor_class": "Qwen2VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff