Source code for yt_audio_collector.system_1.valid_transcript

import re
from typing import List

from yt_audio_collector.constants import HINDI_RE_PATTERN
from yt_audio_collector.system_1.video_to_audio import duration_of_video


def is_valid_hindi_transcript(transcript: List[dict], video_id: str) -> bool:
    """
    Check whether a transcript is usable as a Hindi transcript of a video.

    The transcript is rejected when any of the following holds:

    1. It has no entries.
    2. More than 10 entries have missing or whitespace-only text.
    3. Fewer than 40% of the characters across all non-empty entries are
       matched by ``HINDI_RE_PATTERN``.
    4. The summed duration of the non-empty entries is less than 50% of
       the value returned by ``duration_of_video(video_id)``.

    Parameters:
    -----------
    transcript: `List[dict]`
        A list of transcriptions of a video. Each entry is read through
        its ``"text"`` and ``"duration"`` keys.
    video_id: `str`
        The ID of the video, passed to ``duration_of_video``.

    Return:
    -------
    bool
        True if the transcript is valid, False otherwise.
    """
    max_empty_entries = 10
    min_hindi_percent = 40
    min_coverage = 0.5

    # An empty transcript can never be valid; skip the duration lookup.
    if not transcript:
        return False

    empty_entries = 0
    hindi_chars = 0
    total_chars = 0
    subtitles_duration = 0

    for entry in transcript:
        # A missing "text" key is treated the same as blank text.
        text = entry.get("text") or ""

        # Only whitespace counts as blank (a text such as "+" is real content).
        if not text.strip():
            empty_entries += 1
            # Too many blank entries: the transcript is invalid.
            if empty_entries > max_empty_entries:
                return False
            continue

        # Number of HINDI_RE_PATTERN matches in this entry
        # (presumably one match per Hindi character - confirm against
        # the pattern definition).
        hindi_chars += len(re.findall(HINDI_RE_PATTERN, text))
        total_chars += len(text)

        # A missing duration contributes nothing to the covered time.
        subtitles_duration += entry.get("duration") or 0

    # Hindi share is computed over the whole transcript so that every
    # character weighs the same, regardless of which entry it is in.
    # Integer arithmetic keeps the 40% comparison exact.
    if total_chars == 0 or hindi_chars * 100 < min_hindi_percent * total_chars:
        return False

    # Checked last so that transcripts rejected by the transcript-only
    # checks above never trigger the video duration lookup.
    if subtitles_duration < min_coverage * duration_of_video(video_id):
        return False

    # If all checks pass, the transcript is valid
    return True