"""
ASRDatabunchBuilder is responsible for loading datasets
in csv format.
"""
import os
from pathlib import Path

import pandas as pd
import tqdm

from libreasr.lib.utils import sanitize_str

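# per-split CSV file names; the builder's `mode` selects which one to load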
CSV = {
"train": "asr-dataset-train.csv",
"valid": "asr-dataset-valid.csv",
"test": "asr-dataset-test.csv",
}
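
# default output locations for the tokenizer model and its training corpus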
TOKENIZER_MODEL_FILE = "tmp/tokenizer.yttm-model"
CORPUS_FILE = "tmp/corpus.txt"


class ASRDatabunchBuilder:
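    """Assembles, filters and inspects ASR dataset CSVs for databunch creation."""
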
    def __init__(self):
        self.do_shuffle = False
        self.mode = None
        # tracks whether `build()` has been called
        self.built = False
@staticmethod
def from_config(conf, mode):
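        """
        Construct a builder from a config dict. Expected keys (inferred from
        the usage below): `datasets`, `dataset_paths`, `pcent`, `apply_limits`,
        `almins`/`almaxs` (audio length bounds in seconds), `y_min`/`y_max`,
        `y_max_words` and `shuffle_builder`.
        """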
paths = [conf["dataset_paths"][x] for x in conf["datasets"]]
pcent = conf["pcent"][mode]
builder = ASRDatabunchBuilder().set_mode(mode).multi(paths, pcent)
if conf["apply_limits"]:
builder = (
builder.x_bounds(conf["almins"] * 1000.0, conf["almaxs"] * 1000.0)
.y_bounds(conf["y_min"], conf["y_max"])
.set_max_words(conf["y_max_words"])
)
if conf["shuffle_builder"][mode]:
builder.shuffle()
builder.build()
return builder
def set_mode(self, mode):
self.mode = mode
return self
def single(self, path):
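        """Load the CSV for the current `mode` from a single dataset path."""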
path = Path(path)
self.df = pd.read_csv(path / CSV[self.mode])
return self
def multi(self, paths, pcent=1.0):
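        """Load and concatenate CSVs from multiple dataset paths, optionally
        sampling a fraction `pcent` of each."""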
dfs = []
for path in paths:
path = Path(path)
df = pd.read_csv(path / CSV[self.mode])
if pcent != 1.0:
df = df.sample(frac=pcent)
dfs.append(df)
# TODO set sort=False at some point (as it is not needed)
self.df = pd.concat(dfs, ignore_index=True, copy=False, sort=True)
return self
def x_bounds(self, x_min, x_max):
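        """Keep only samples whose audio length `xlen` (in ms) lies in [x_min, x_max]."""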
self.x_min = x_min
self.x_max = x_max
return self
def y_bounds(self, y_min, y_max):
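        """Keep only samples whose label length `ylen` lies in [y_min, y_max]."""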
self.y_min = y_min
self.y_max = y_max
return self
def set_max_words(self, max_words):
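        """Limit the number of words allowed in a (sanitized) label."""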
self.max_words = max_words
return self
def shuffle(self, b=True):
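        """Enable (or disable) shuffling of the dataframe during `build()`."""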
self.do_shuffle = b
return self
def _assert_has_df(self):
assert hasattr(self, "df")
def _apply_limits(self):
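        """Filter the dataframe by the previously configured bounds, if any."""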
df = self.df
if hasattr(self, "x_min"):
df = df[df.xlen >= self.x_min]
if hasattr(self, "x_max"):
df = df[df.xlen <= self.x_max]
if hasattr(self, "y_min"):
df = df[df.ylen >= self.y_min]
if hasattr(self, "y_max"):
df = df[df.ylen <= self.y_max]
if hasattr(self, "max_words"):
def filt(row):
l = sanitize_str(row.label)
if len(l.split(" ")) <= self.max_words and len(l) > 5:
return True
return False
df = df[df.apply(filt, axis=1)]
self.df = df
def _fix_columns(self):
df = self.df
df["label"] = df["label"].astype(str)
df["sr"] = df["sr"].astype(int)
self.df = df
    def _remove_columns(self):
        # these columns may or may not be present, depending on the dataset
        self.df.drop(columns=["bad", "lang"], errors="ignore", inplace=True)
def build(self, sort=False, seed=42):
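        """Finalize the dataframe: fix dtypes, apply limits, drop unused columns,
        then optionally shuffle (with a fixed seed) or sort by label."""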
self._assert_has_df()
self._fix_columns()
self._apply_limits()
self._remove_columns()
if self.do_shuffle:
self.df = self.df.sample(frac=1, random_state=seed)
if sort:
self.df.sort_values(by="label", inplace=True)
self.built = True
return self
    def get(self):
        df = self.df
        _fs = df.file.tolist()  # audio file paths
        _is = list(range(len(_fs)))  # integer sample indices
        _ts = list(df.itertuples(index=False))  # full rows as namedtuples
        return _fs, _is, _ts, df
def print(self):
if self.built:
print("mode:", self.mode)
print("num samples:", len(self.df))
print("num hours:", self.df.xlen.values.sum() / (1000.0 * 3600.0))
print(self.df.head())
return self
def dump_labels(self, to_file=CORPUS_FILE):
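        """Write one sanitized label per line to `to_file` (the tokenizer corpus)."""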
assert self.built
print(f"Dumping labels to {to_file}")
os.makedirs(Path(to_file).parent, exist_ok=True)
with open(to_file, "w") as f:
        for _, row in tqdm.tqdm(self.df.iterrows(), total=len(self.df)):
f.write(sanitize_str(row.label) + "\n")
print("Done.")
def train_tokenizer(
self,
corpus_file=CORPUS_FILE,
model_file=TOKENIZER_MODEL_FILE,
vocab_sz=50000,
dump_labels=True,
):
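        """Train a YouTokenToMe BPE tokenizer on the dumped label corpus,
        then run a quick encode/decode sanity check."""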
assert self.built
import youtokentome as yttm
# first we need to dump labels
if dump_labels:
self.dump_labels(corpus_file)
# train model
print("Training yttm model...")
yttm.BPE.train(data=corpus_file, vocab_size=vocab_sz, model=model_file)
print("Done.")
# load model (for testing)
print("Testing yttm model...")
bpe = yttm.BPE(model=model_file)
# Two types of tokenization
test_text = "Are you freakin' crazy?"
encoded1 = bpe.encode([test_text], output_type=yttm.OutputType.ID)
encoded2 = bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD)
decoded = bpe.decode(encoded1)
print(encoded1)
print(encoded2)
print(decoded)
    def plot(self, save=False):
        """Plot histograms of `xlen`, `ylen` and their ratio."""
        if self.built:
            import matplotlib.pyplot as plt

            # savefig must happen before show(), which clears the figure
            plt.hist(self.df.xlen.values, bins=50)
            plt.title(f"xlen ({self.df.xlen.values.sum() / 3600000.:.2f} hours)")
            if save:
                plt.savefig("./plots/figures/data-x.png", dpi=300)
            plt.show()
            plt.hist(self.df.ylen.values, bins=30)
            plt.title("ylen")
            if save:
                plt.savefig("./plots/figures/data-y.png", dpi=300)
            plt.show()
            plt.hist(self.df.xlen.values / self.df.ylen.values, bins=50)
            plt.title("xlen/ylen")
            if save:
                plt.savefig("./plots/figures/data-x-y.png", dpi=300)
            plt.show()
        return self


if __name__ == "__main__":
    fs, _is, ts, df = (
        ASRDatabunchBuilder()
        .set_mode("train")  # mode must be set before CSVs can be located
        .multi(
[
"/data/stt/data/common-voice/fr",
"/data/stt/data/common-voice/de",
"/data/stt/data/common-voice/en",
]
)
.x_bounds(0, 5000)
.y_bounds(2, 20)
.build()
.print()
.plot()
.get()
)
print(fs[0], _is[0], ts[0])
print("\n\nhead:", df.head(n=20))
print("\n\ntail:", df.tail(n=20))