Source code for yt_audio_collector.system_1.valid_transcript

import re
from typing import List

from yt_audio_collector.constants import HINDI_RE_PATTERN
from yt_audio_collector.system_1.video_to_audio import duration_of_video



[docs]
# Thresholds used by ``is_valid_hindi_transcript``.
_MAX_EMPTY_SEGMENTS = 10  # more empty segments than this => invalid
_MIN_SUBTITLE_COVERAGE = 0.5  # subtitles must cover >= 50% of the video
_MIN_HINDI_RATIO = 0.4  # >= 40% of the transcript text must be Hindi


def is_valid_hindi_transcript(transcript: List[dict], video_id: str) -> bool:
    """
    Checks if the given transcript is valid:
    1. The transcript must be in Hindi (at least 40% of its text matches
       ``HINDI_RE_PATTERN``).
    2. Exists for the full video without empty text (at most 10 empty
       segments, and the non-empty segments cover at least 50% of the
       video duration).

    Parameters:
    -----------
    transcript: `List[dict]`
        A list of transcriptions of a video. Each entry is read through its
        ``"text"`` and ``"duration"`` keys; a missing or ``None`` text is
        treated as empty and a missing or ``None`` duration as 0.
    video_id: `str`
        The ID of the video, passed to ``duration_of_video``.

    Return:
    -------
    bool
        True if the transcript is valid, False otherwise.
    """
    empty_segment_count = 0
    hindi_match_count = 0
    total_char_count = 0
    subtitles_duration = 0
    for segment in transcript:
        text = segment.get("text")
        # A segment with no text, or only whitespace, counts as empty.
        if not text or not text.strip():
            empty_segment_count += 1
            # Bail out early: too many empty segments make the transcript
            # invalid regardless of the remaining checks.
            if empty_segment_count > _MAX_EMPTY_SEGMENTS:
                return False
            continue
        # Accumulate counts over the whole transcript so that every segment
        # contributes in proportion to its length (averaging per-segment
        # ratios pairwise would let the last few segments dominate).
        # NOTE(review): assumes HINDI_RE_PATTERN matches one character per
        # match, so the number of matches is the number of Hindi characters.
        hindi_match_count += len(re.findall(HINDI_RE_PATTERN, text))
        total_char_count += len(text)
        # Only non-empty segments count towards subtitle coverage.
        subtitles_duration += segment.get("duration") or 0

    # NOTE(review): presumes duration_of_video returns the video length in
    # the same unit as the segment durations - confirm against its definition.
    if subtitles_duration < _MIN_SUBTITLE_COVERAGE * duration_of_video(video_id):
        return False
    # No text at all means there is nothing Hindi in the transcript.
    if total_char_count == 0:
        return False
    # Valid only if enough of the transcript text is Hindi.
    return hindi_match_count / total_char_count >= _MIN_HINDI_RATIO