File size: 4,418 Bytes
2fc6c45
d0c2b7c
 
 
2fc6c45
d0c2b7c
 
 
 
 
2fc6c45
d0c2b7c
 
 
 
2fc6c45
 
d0c2b7c
 
 
2fc6c45
d0c2b7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fc6c45
d0c2b7c
 
2fc6c45
d0c2b7c
 
 
 
2fc6c45
d0c2b7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fc6c45
d0c2b7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fc6c45
d0c2b7c
 
 
 
 
 
 
 
 
2fc6c45
 
 
d0c2b7c
 
 
2fc6c45
 
d0c2b7c
 
099ffc5
 
2fc6c45
 
 
 
d0c2b7c
2fc6c45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import re
import gradio as gr
from collections import deque

# Local project modules
import music_search
from process import process_images, process_audio_video
from html_image import setup_chrome, html_to_image, render_abc
from response import get_zhipuai_response

# Runs once at import time, before the interface is built.
# NOTE(review): presumably prepares the Chrome backend used by html_image
# for rendering — confirm against html_image.setup_chrome.
setup_chrome()

# Conversation memory: a bounded deque holding at most 10 entries. Each
# process_input call appends one entry, so the oldest round is dropped once
# more than 10 rounds have accumulated.
memory = deque(maxlen=10)


class State:
    """Cache of the most recently processed image and media inputs.

    Attributes are plain fields, all reset to None by ``init()``:
    ``prev_image_result``, ``prev_image_files``, ``prev_media_result``,
    ``prev_media_file`` and ``prev_media_viewer``.
    """

    def __init__(self):
        # init() only assigns attributes and returns None, so ``state`` is
        # always None; the attribute is kept for backward compatibility.
        self.state = self.init()

    def init(self):
        """Reset every cached field to None."""
        for field_name in (
            "prev_image_result",
            "prev_image_files",
            "prev_media_result",
            "prev_media_file",
            "prev_media_viewer",
        ):
            setattr(self, field_name, None)

    def image_state_update(self, result, files):
        """Remember the latest image result and the files it was built from."""
        self.prev_image_result, self.prev_image_files = result, files

    def media_state_update(self, result, file, viewer):
        """Remember the latest media result, its source file and its viewer."""
        self.prev_media_result, self.prev_media_file = result, file
        self.prev_media_viewer = viewer

# Module-level State instance read and updated by process_input.
state = State()
def process_input(text=None, images=None, media=None):
    """Build a prompt from the user's inputs and query the language model.

    The conversation context (``memory`` and ``state``) is reset exactly once
    per call when a song search hits or when any uploaded input differs from
    the one cached in ``state``. After a reset every provided input is
    processed again so the new context is complete; otherwise the previously
    processed results are reused through the history kept in ``memory``.

    Args:
        text: Optional user instruction.
        images: Optional collection of image file paths (the UI passes a
            list of path strings).
        media: Optional audio/video input. Either a path string (what the UI
            passes with ``type="filepath"``) or an object exposing the path
            as ``.name``.

    Returns:
        A ``(response, media_output, abc_image_output)`` tuple:
        the value returned by ``get_zhipuai_response``, an ``<iframe>`` HTML
        string for the cached media viewer (or ``""``), and the value of
        ``render_abc`` for a found song (or ``"1"`` when there is none).
    """
    print("Starting process_input")
    system = "1.你是一个音乐专家,只能回答音乐知识,和打招呼,回复的内容为普通文本格式,不用任何markdown符号如加粗等。如果提供的乐谱是abc记谱法,则回复时不要用abc记谱法,需要使用专业音乐词汇和自然语言进行回答问题\n2.你将根据下面指令回答问题,但是不能违反第一条指令,也不能在回复中提及。"
    messages = [{"role": "system", "content": system}]

    prompt = ""
    abc = False

    # Text input: record the instruction and check whether it is a song search.
    if text:
        print("Processing text input")
        prompt += f"用户指令: {text}."
        abc = music_search.is_search(prompt)

    # Normalise the media argument to a comparable path. The original code
    # called ``media.name`` on a plain string, which raised AttributeError as
    # soon as a cached media result existed.
    media_path = getattr(media, "name", media) if media else None
    prev_media_path = getattr(state.prev_media_file, "name", state.prev_media_file)

    images_cached = bool(
        images
        and state.prev_image_files
        and set(images) == set(state.prev_image_files)
    )
    media_cached = bool(
        media_path and prev_media_path is not None and media_path == prev_media_path
    )

    # Decide up front whether the context must be reset, and reset only once.
    # Resetting separately inside each branch made the media branch wipe the
    # image cache that had just been stored.
    needs_reset = bool(
        abc
        or (images and not images_cached)
        or (media_path and not media_cached)
    )
    if needs_reset:
        memory.clear()
        state.init()

    if abc:
        prompt += f"找到了用户搜的曲子,根据指令简略解读一下:{abc}"

    # Image input. Handled independently of ``text`` so that uploads without
    # an accompanying instruction are not silently ignored.
    if images:
        if needs_reset:
            print("Processing images")
            image_result = process_images(images)
            prompt += image_result
            state.image_state_update(image_result, images)
        else:
            print("Using previous image result")

    # Audio/video input.
    if media_path:
        if needs_reset:
            print("Processing media")
            is_video = str(media_path).lower().endswith(".mp4")
            result, result_viewer_path = process_audio_video(media, is_video=is_video)
            prompt += result
            state.media_state_update(result, media, result_viewer_path)
        else:
            print("Using previous video result")

    # Replay earlier prompts from memory as additional user messages.
    for past in memory:
        messages.append({"role": "user", "content": "这是前几轮指令内容,根据需求读取这些内容:"+past["prompt"]})

    response = get_zhipuai_response(messages, prompt)

    # Store this round so later calls can refer back to it.
    memory.append({"prompt": prompt, "response": response})

    media_output = f"""<iframe src="{state.prev_media_viewer}" width="100%" height="600"></iframe>""" if state.prev_media_viewer else ""
    # NOTE(review): "1" is rendered in the HTML output when no song was
    # found; it looks like a leftover placeholder — confirm before changing.
    abc_image_output = render_abc(abc) if abc else "1"
    return response, media_output, abc_image_output


# Build the Gradio interface. The three inputs are passed positionally to
# process_input as (text, images, media); the three outputs receive its
# returned (response, media_output, abc_image_output) tuple in order.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="我是音乐多模态大模型,您可以上传需要分析的曲谱,音频和视频", lines=2),
        gr.File(label="Input Images", file_count="multiple", type="filepath"),
        gr.File(label="Input media, mp3 or mp4", type="filepath"),
    ],
    outputs=[
        # NOTE(review): per the Gradio docs `interactive=True` makes the
        # textbox editable; it does not enable streaming — confirm intent.
        gr.Textbox(label="Output Text", interactive=True),
        gr.HTML(label="Video Viewer"),
        # Unlabelled HTML slot for the third return value (abc_image_output).
        gr.HTML()
    ],
    live=False,
)

# Launch the Gradio application (runs at import time; there is no
# `if __name__ == "__main__"` guard).
iface.launch()