-
GenAI – Audio File Chunking Process
GenAI – Audio File Chunking Process
Table of Contents: How To Chunk Audio Files?
(1) Reference Links
https://github.com/infiniflow/ragflow/blob/main/rag/app/audio.py
(2) How To Chunk Audio Files?
Imported File Links:
https://github.com/infiniflow/ragflow/blob/main/api/db/__init__.py
https://github.com/infiniflow/ragflow/blob/main/rag/nlp/rag_tokenizer.py
https://github.com/infiniflow/ragflow/blob/main/api/db/services/llm_service.py
https://github.com/infiniflow/ragflow/blob/main/rag/nlp/__init__.py
import re from api.db import LLMType from rag.nlp import rag_tokenizer from api.db.services.llm_service import LLMBundle from rag.nlp import tokenize def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): doc = { "docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r".[a-zA-Z]+$", "", filename)) } doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) # is it English eng = lang.lower() == "english" # is_english(sections) try: callback(0.1, "USE Sequence2Txt LLM to transcription the audio") seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang) ans = seq2txt_mdl.transcription(binary) callback(0.8,
