Spaces:
Sleeping
Sleeping
Abid
committed on
Commit
·
ff0d701
1
Parent(s):
1c78367
eval edit
Browse files
eval.py
CHANGED
|
@@ -61,8 +61,6 @@ def normalize_text(text: str) -> str:
|
|
| 61 |
text = re.sub("['ّ]", '', text)
|
| 62 |
text = re.sub("['ٔ]", '', text)
|
| 63 |
text = re.sub("['ٰ]", '', text)
|
| 64 |
-
# batch["sentence"] = re.sub("[ء]", '', batch["sentence"])
|
| 65 |
-
# batch["sentence"] = re.sub("[آ]", 'ا', batch["sentence"])
|
| 66 |
text = re.sub("[ۂ]", 'ہ', text)
|
| 67 |
text = re.sub("[ي]", "ی",text)
|
| 68 |
text = re.sub("[ؤ]", "و", text)
|
|
@@ -74,15 +72,20 @@ def normalize_text(text: str) -> str:
|
|
| 74 |
# note that order is important here!
|
| 75 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
| 76 |
|
|
|
|
| 77 |
for t in token_sequences_to_ignore:
|
| 78 |
text = " ".join(text.split(t))
|
| 79 |
|
| 80 |
return text
|
| 81 |
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
def main(args):
|
| 84 |
# load dataset
|
| 85 |
dataset = load_dataset(args.dataset, args.config,delimiter="\t",split=args.split, use_auth_token=True)
|
|
|
|
| 86 |
|
| 87 |
# for testing: only process the first two examples as a test
|
| 88 |
# dataset = dataset.select(range(10))
|
|
@@ -92,7 +95,8 @@ def main(args):
|
|
| 92 |
sampling_rate = feature_extractor.sampling_rate
|
| 93 |
|
| 94 |
# resample audio
|
| 95 |
-
dataset = dataset.cast_column("
|
|
|
|
| 96 |
|
| 97 |
# load eval pipeline
|
| 98 |
if args.device is None:
|
|
@@ -102,7 +106,7 @@ def main(args):
|
|
| 102 |
# map function to decode audio
|
| 103 |
def map_to_pred(batch):
|
| 104 |
prediction = asr(
|
| 105 |
-
batch["
|
| 106 |
)
|
| 107 |
|
| 108 |
batch["prediction"] = prediction["text"]
|
|
|
|
| 61 |
text = re.sub("['ّ]", '', text)
|
| 62 |
text = re.sub("['ٔ]", '', text)
|
| 63 |
text = re.sub("['ٰ]", '', text)
|
|
|
|
|
|
|
| 64 |
text = re.sub("[ۂ]", 'ہ', text)
|
| 65 |
text = re.sub("[ي]", "ی",text)
|
| 66 |
text = re.sub("[ؤ]", "و", text)
|
|
|
|
| 72 |
# note that order is important here!
|
| 73 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
| 74 |
|
| 75 |
+
|
| 76 |
for t in token_sequences_to_ignore:
|
| 77 |
text = " ".join(text.split(t))
|
| 78 |
|
| 79 |
return text
|
| 80 |
|
| 81 |
+
def path_adjust(batch):
|
| 82 |
+
batch["path"] = "Data/ur/clips/"+str(batch["path"])
|
| 83 |
+
return batch
|
| 84 |
|
| 85 |
def main(args):
|
| 86 |
# load dataset
|
| 87 |
dataset = load_dataset(args.dataset, args.config,delimiter="\t",split=args.split, use_auth_token=True)
|
| 88 |
+
|
| 89 |
|
| 90 |
# for testing: only process the first two examples as a test
|
| 91 |
# dataset = dataset.select(range(10))
|
|
|
|
| 95 |
sampling_rate = feature_extractor.sampling_rate
|
| 96 |
|
| 97 |
# resample audio
|
| 98 |
+
dataset = dataset.cast_column("path", path_adjust())
|
| 99 |
+
dataset = dataset.cast_column("path", Audio(sampling_rate=sampling_rate))
|
| 100 |
|
| 101 |
# load eval pipeline
|
| 102 |
if args.device is None:
|
|
|
|
| 106 |
# map function to decode audio
|
| 107 |
def map_to_pred(batch):
|
| 108 |
prediction = asr(
|
| 109 |
+
batch["path"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
| 110 |
)
|
| 111 |
|
| 112 |
batch["prediction"] = prediction["text"]
|