Giter Club home page Giter Club logo

uni-tts's People

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar

uni-tts's Issues

关于AR模型的问题

hi,我想咨询下推理核心代码中from AR.models.t2s_lightning_module import Text2SemanticLightningModule,使用的就是官方的AR模块吗,您主要针对这一部分做过哪些推理加速呢?

源码不可用,缺少text依赖

from text import cleaned_text_to_sequence
from text.cleaner import clean_text

源码不可用啊,有说法么?上面两个依赖不在 PyPI 上么?

请问模型管理的情绪那里,是选英文的emotion_list还是自己定义情绪名字?

请问模型管理的情绪那里,是选英文的emotion_list还是自己定义情绪名字?
我看B站的花火情绪是用中文自己定义的 https://www.bilibili.com/video/BV1dy421z7YB
这和emotion_list有什么区别,哪个更适合,这两者有什么联系吗
"花火": [
"default",
"平常的",
"慢速病娇",
"傻白甜",
"平静的",
"疯批",
"聊天",
]

默认的emotion_list:
default
advertisement upbeat
affectionate
angry
assistant
calm
。。。。等等

测试了一下并发请求,多语言,多角色,发现并发推理声音会抽风,服务端有一半概率返回500

感谢大佬分享,这是测试脚本

# from text_utils.segmenter import SentenceSegmenter
# from text_utils.tokenizer import Tokenizer
#https://github.com/numb3r3/text_utils
# from polyglot.text import Text
import requests,re,jieba
import concurrent.futures

# Sample input mixing Chinese, Japanese and English.
# NOTE: this first assignment is dead -- it is overwritten by the next statement.
text = '''皆さん、我在インターネット上看到someone把几国language混在一起speak。我看到之后be like:それは我じゃないか!私もtry一tryです。虽然是混乱している句子ですけど、中文日本語プラスEnglish、挑戦スタート!我study日本語的时候,もし有汉字,我会很happy。Bueause**人として、when I see汉字,すぐに那个汉字がわかります。But 我hate外来語、什么マクドナルド、スターバックス、グーグル、ディズニーランド、根本记不住カタカナhow to写、太難しい。以上です,byebye!'''
# Variant of the same passage with most of the punctuation removed; this is the
# value actually used by the rest of the script.
text = '''皆さん我在インターネット上看到someone把几国!language混在一起speak我看到之后be like それは我じゃないか私もtry一tryです虽然是混乱している句子ですけど中文日本語プラスEnglish挑戦スタート我study日本語的时候もし有汉字我会很happyBueause**人としてwhen I see汉字すぐに那个汉字がわかりますBut 我hate外来語什么マクドナルドスターバックスグーグルディズニーランド根本记不住カタカナhow to写太難しい以上ですbyebye!'''

# textlist = SentenceSegmenter(token_limits=10).segment(text)
# text_mixed = Text('\n'.join(textlist)).sentences
#
# for text in text_mixed:
#     print(text)
#     print('\n')

# Spoken Chinese names for symbols found directly between two Chinese characters.
# "/" and "_" are turned into spaces by _normalize_punctuation before this table
# is consulted, so those two entries are currently unreachable.
_SYMBOL_WORDS = {
    "&": "和",
    "\\": "反斜杠",
    "/": "斜杠",
    "+": "加",
    "-": "杠",
    "_": "杠",
    "*": "星",
    "%": "百分比",
    "=": "等",
}

# Chinese phonetic spellings for a single lowercase ASCII letter.
_LETTER_WORDS = {
    "a": "唉",
    "b": "比",
    "c": "西",
    "d": "第",
    "e": "衣",
    "f": "爱抚",
    "g": "记",
    "h": "唉取",
    "i": "爱",
    "j": "姐",
    "k": "克唉",
    "l": "唉欧",
    "m": "唉母",
    "n": "恩",
    "o": "欧",
    "p": "屁",
    "q": "口",
    "r": "阿",
    "s": "唉撕",
    "t": "体",
    "u": "油",
    "v": "喂",
    "w": "打不留",
    "x": "唉克撕",
    "y": "外",
    "z": "热",
}

# One symbol from _SYMBOL_WORDS with a Chinese character on each side.  The
# lookarounds need the [...] brackets: without them they match the literal
# three-character text "\u4e00-\u9fa5" and the substitution never fires.
_SYMBOL_BETWEEN_CJK = re.compile(
    "(?<=[\u4e00-\u9fa5])["
    + "".join(re.escape(symbol) for symbol in _SYMBOL_WORDS)
    + "](?=[\u4e00-\u9fa5])"
)

# One lowercase letter whose neighbours on both sides exist and are neither
# ASCII alphanumerics nor whitespace (i.e. a letter that is not part of a word).
_ISOLATED_LETTER = re.compile(r"(?<=[^a-zA-Z0-9\s])[a-z](?=[^a-zA-Z0-9\s])")

# The two patterns used by the segment-boundary test, compiled once.
_NOT_CJK_OR_DIGIT = re.compile(r'[^\u4e00-\u9fa5\d]')
_NOT_ASCII_WORD = re.compile(r'[^a-zA-Z\d\s\']')


def _normalize_punctuation(text):
    """Collapse punctuation variants (ASCII and fullwidth) to one ASCII form."""
    # , ; and their fullwidth twins (U+FF0C, U+FF1B) plus the ideographic comma.
    text = re.sub(r'[,\uff0c;\uff1b、]+', ',', text)
    # ? and fullwidth U+FF1F.
    text = re.sub(r'[?\uff1f]+', '?', text)
    # ! and fullwidth U+FF01.
    text = re.sub(r'[!\uff01]+', '!', text)
    # Ideographic and ASCII full stops.
    text = re.sub(r'[。.]+', '.', text)
    # Every quote style becomes a single apostrophe.
    text = re.sub(r'["\'’]+', "'", text)
    # Brackets (incl. fullwidth U+FF08/U+FF09), pipe, colons (incl. U+FF1A),
    # slash and underscore each become one space.
    text = re.sub(r'[(\uff08{\[)\uff09\]}|:\uff1a/_]', ' ', text)
    # Line breaks become commas; this must run before whitespace is collapsed.
    text = re.sub(r'[\n\r]+', ',', text)
    # Any remaining whitespace run becomes a single space.
    text = re.sub(r'\s+', ' ', text)
    return text


def _segment_tokens(tokens, min_length, max_length):
    """Greedily concatenate tokens into segments bounded by the two lengths."""
    segments = []
    parts = []
    length = 0
    for token in tokens:
        parts.append(token)
        length += len(token)
        # NOTE(review): taken together these two patterns match any token that
        # contains at least one non-digit character, so once min_length is
        # passed the segment is closed at the very next such token.  The
        # original design note suggests only punctuation was meant to close a
        # segment -- confirm before tightening.
        closes_segment = (
            _NOT_CJK_OR_DIGIT.search(token) or _NOT_ASCII_WORD.search(token)
        )
        if (length > min_length and closes_segment) or length > max_length:
            segments.append("".join(parts))
            parts = []
            length = 0

    tail = "".join(parts)
    if tail:
        segments.append(tail)
    return segments


def split_text(text, max_length):
    """Normalize mixed-language text and split it into short segments.

    The text is lowercased, its punctuation is normalized, symbols sitting
    between two Chinese characters and isolated single letters are replaced by
    Chinese words, leftover symbol runs that are not followed by an ASCII
    letter or digit are dropped, and the result is tokenized with
    ``jieba.cut`` and regrouped into segments.

    Args:
        text: The input string.
        max_length: Nominal segment length in characters.  A segment may be
            closed once it is longer than ``0.5 * max_length`` and is always
            closed once it is longer than ``1.8 * max_length``.

    Returns:
        A list of segment strings in their original order.  Empty input gives
        an empty list.
    """
    text = _normalize_punctuation(text.lower())

    # Speak symbols that sit between two Chinese characters.
    text = _SYMBOL_BETWEEN_CJK.sub(lambda m: _SYMBOL_WORDS[m.group()], text)
    # Spell out letters that stand alone between non-word characters.
    text = _ISOLATED_LETTER.sub(lambda m: _LETTER_WORDS[m.group()], text)

    # Drop symbol runs that are not immediately followed by a letter or digit.
    text = re.sub(r'[\\\_\-\&\+\-\*\%\=]+(?![a-zA-Z\d])', '', text)

    return _segment_tokens(jieba.cut(text), max_length * 0.5, max_length * 1.8)

def send(speaker, text, index):
    """POST one synthesis request and save the audio as ``<index>.wav``.

    Args:
        speaker: Character name sent as ``cha_name``.
        text: The text to synthesize.
        index: Number used for the output file name.

    Returns:
        None.  On a non-success HTTP status a message is printed and no file
        is written, so an error body is never saved as if it were audio.

    Raises:
        Whatever ``requests.post`` raises for connection errors or when the
        25 second timeout expires.
    """
    postdata = {
        "cha_name": speaker,
        "character_emotion": "default",
        "text": text,
        "text_language": "多语种混合",
        'batch_size': 4,
        "top_k": 6, "top_p": 0.8, "temperature": 0.8,
    }
    response = requests.post("http://127.0.0.1:5000/tts", json=postdata, timeout=25)
    if not response.ok:
        print(f"request {index} ({speaker}) failed: HTTP {response.status_code}")
        return
    # 'wb' rather than 'ab': appending to a file left over from an earlier run
    # would concatenate two WAV streams into one unusable file.
    with open(f'{index}.wav', 'wb') as f:
        f.write(response.content)

# Not read by anything below; kept only so the module-level name still exists.
index = 0

speakers = ['1980US','HuTao']
# speakers = ['1980US']

textlist = split_text(text, 20)

# One (speaker, text, 1-based index) job per non-blank segment and speaker,
# ordered segment-major so all speakers of one segment are adjacent.
SK_TEXT_INDEX = [
    (speaker, segment, number)
    for number, (speaker, segment) in enumerate(
        ((s, t) for t in textlist if t.strip() for s in speakers),
        start=1,
    )
]

# Leaving the ``with`` block waits for every submitted job, so no separate
# wait() call is needed afterwards.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    tasks = [executor.submit(send, *job) for job in SK_TEXT_INDEX]

# Surface exceptions (timeouts, connection errors) that would otherwise stay
# hidden inside the futures.
for job, task in zip(SK_TEXT_INDEX, tasks):
    error = task.exception()
    if error is not None:
        print(f"request {job[2]} ({job[0]}) raised {error!r}")

角色选择bug

当post请求中设置非默认角色时,或使用默认角色的参考文本进行推理,导致除了默认角色外均无法正常生成音频

如下,默认角色为dingzhen
image
image

Flask 模块问题

这是我安装的flask:
PS D:\SSDOWN\TTS-for-GPT-soVITS-main> pip3 show flask
Name: Flask
Version: 2.1.2
Summary: A simple framework for building complex web applications.
Home-page: https://palletsprojects.com/p/flask
Author: Armin Ronacher
Author-email: [email protected]
License: BSD-3-Clause
Location: D:\program\python\Lib\site-packages
Requires: click, itsdangerous, Jinja2, Werkzeug
Required-by:
但是运行后显示:
Traceback (most recent call last):
File "D:\SSDOWN\GPT-SoVITS-0211\TTS-for-GPT-soVITS\tts_backend.py", line 2, in
from flask import Flask, request, Response, jsonify
ModuleNotFoundError: No module named 'flask'
我只安装了Python3:
Python 3.12.0 (tags/v3.12.0:0fb18b0, Oct 2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
大佬能解答一下吗?

python版本推理

老师是否以后会出python版本呢直接传入text就能合成音频

多語言合成失敗 RuntimeError: shape '[1, 1, 1, 159]' is invalid for input of size 144

前端处理后的文本(每句): ['我是一个粉刷匠,O H!Y E S。 ']
ERROR:tts_backend:Exception on /tts [POST]
Traceback (most recent call last):
File "C:\Users\Admin\GPT-SoVITS\runtime\lib\site-packages\flask\app.py", line 1463, in wsgi_app
response = self.full_dispatch_request()
File "C:\Users\Admin\GPT-SoVITS\runtime\lib\site-packages\flask\app.py", line 872, in full_dispatch_request
rv = self.handle_user_exception(e)
File "C:\Users\Admin\GPT-SoVITS\runtime\lib\site-packages\flask\app.py", line 870, in full_dispatch_request
rv = self.dispatch_request()
File "C:\Users\Admin\GPT-SoVITS\runtime\lib\site-packages\flask\app.py", line 855, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args) # type: ignore[no-any-return]
File "C:\Users\Admin\GPT-SoVITS\runtime\lib\site-packages\flask_httpauth.py", line 174, in decorated
return self.ensure_sync(f)(*args, **kwargs)
File "C:\Users\Admin\GPT-SoVITS\Inference\src\tts_backend.py", line 126, in tts
sampling_rate, audio_data = next(gen)
File "C:\Users\Admin\GPT-SoVITS\GPT_SoVITS\TTS_infer_pack\TTS.py", line 579, in run
pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
File "C:\Users\Admin\GPT-SoVITS\GPT_SoVITS\AR\models\t2s_model.py", line 553, in infer_panel
xy_padding_mask.view(bsz, 1, 1, src_len).expand(-1, self.num_head, -1, -1)
RuntimeError: shape '[1, 1, 1, 159]' is invalid for input of size 144
INFO:werkzeug:127.0.0.1 - - [12/Mar/2024 16:40:35] "POST /tts HTTP/1.1" 500 -

infer_panel() got an unexpected keyword argument 'top_p'

音频测试时提示
Traceback (most recent call last):
File "D:\study\python\aimodel\GPT-SoVITS-beta\TTS-for-GPT-soVITS\tts_backend.py", line 9, in
from character_manage import load_character, character_name, get_wav_from_text_api
File "D:\study\python\aimodel\GPT-SoVITS-beta\TTS-for-GPT-soVITS\character_manage.py", line 153, in
test_audio_save()
File "D:\study\python\aimodel\GPT-SoVITS-beta\TTS-for-GPT-soVITS\character_manage.py", line 147, in test_audio_save
fs, audio_to_save=get_wav_from_text_api("""这是一段音频测试""",'多语种混合')
File "D:\study\python\aimodel\GPT-SoVITS-beta\TTS-for-GPT-soVITS\character_manage.py", line 143, in get_wav_from_text_api
return get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, top_k=top_k, top_p=top_p, temperature=temperature, ref_free=ref_free)
File "D:\study\python\aimodel\GPT-SoVITS-beta\TTS-for-GPT-soVITS\inference_core.py", line 310, in get_tts_wav
pred_semantic, idx = t2s_model.model.infer_panel(
TypeError: infer_panel() got an unexpected keyword argument 'top_p'

对于 ImportError: cannot import name 'url_quote' from 'werkzeug.urls' 错误的解决方法

Traceback (most recent call last):  File "/root/GPT-SoVITSGIT/TTS-for-GPT-soVITS/tts_backend.py", line 2, in <module>
    from flask import Flask, request, Response, jsonify  File "/root/miniconda3/envs/GPTSoVits/lib/python3.9/site-packages/flask/__init__.py", line 7, in <module>    from .app import Flask as Flask  File "/root/miniconda3/envs/GPTSoVits/lib/python3.9/site-packages/flask/app.py", line 27, in <module>    from . import cli
  File "/root/miniconda3/envs/GPTSoVits/lib/python3.9/site-packages/flask/cli.py", line 17, in <module>    from .helpers import get_debug_flag
  File "/root/miniconda3/envs/GPTSoVits/lib/python3.9/site-packages/flask/helpers.py", line 14, in <module>
    from werkzeug.urls import url_quote
ImportError: cannot import name 'url_quote' from 'werkzeug.urls' (/root/miniconda3/envs/GPTSoVits/lib/python3.9/site-packages/werkzeug/urls.py)

解决方案,指定 werkzeug 的版本
pip install Werkzeug==2.2.2

感谢

非常棒的一个项目,正打算在做这个商业项目,感谢开源。如果能将训练的过程也做成API的话就更好了。目前只能调用预训练好的音色API。

ModuleNotFoundError: No module named 'text'

from text import cleaned_text_to_sequence
from text.cleaner import clean_text

请问这里的text包是怎么安装的呀,我pip和conda都找不到这个text包。
linux环境

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, making everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.