File size: 2,733 Bytes
6d5047c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Text prompt sanitization for motion generation (whitespace, punctuation, capitalization)."""


def sanitize_text(text: str, paragraph: bool = True) -> str:
    """Sanitize a text prompt into a clean, capitalized, punctuated sentence.

    Steps, in order: collapse whitespace, drop leading non-alphanumeric
    characters, capitalize (first character upper-cased, the rest lower-cased,
    as done by ``str.capitalize``), drop trailing characters that are neither
    alphanumeric nor closing punctuation, and make sure the text ends with
    ``.``, ``!`` or ``?``.

    Args:
        text: Input text prompt.
        paragraph: If True, capitalize after each sentence break and normalize
            spacing between sentences. A run of consecutive break characters
            (``...``, ``?!``) is kept together as a single break.

    Returns:
        Sanitized text. Empty string when the input contains no alphanumeric
        character at all (including empty or whitespace-only input).
    """
    import re  # local import: the module has no top-level import block

    # str.split() without arguments drops leading/trailing whitespace and
    # collapses every whitespace run, so no separate strip() is needed.
    text = " ".join(text.split())

    # Drop leading non-alphanumeric characters. With nothing alphanumeric
    # there is no prompt left to keep, so do not fabricate one such as "-.".
    first_alnum = next((idx for idx, char in enumerate(text) if char.isalnum()), None)
    if first_alnum is None:
        return ""
    text = text[first_alnum:].capitalize()

    # Drop trailing junk, but keep characters that can legitimately close a
    # sentence (terminators, closing quotes and brackets).
    final_punctuations = ".!?\"])'"
    end = len(text)
    while end > 1 and not (text[end - 1].isalnum() or text[end - 1] in final_punctuations):
        end -= 1
    text = text[:end]

    # Add a period at the end if the text is not already terminated.
    if text[-1] not in ".!?":
        text += "."

    if not paragraph:
        return text

    # re.split with a capturing group keeps the separators, so `pieces`
    # alternates: body, break-run, body, break-run, ..., body.
    pieces = re.split(r"([.!?]+)", text)
    bodies = pieces[0::2]
    break_runs = pieces[1::2] + [""]  # pad so the final body has a partner

    sentences = []
    for body, break_run in zip(bodies, break_runs):
        body = body.strip()  # remove extra spaces around the sentence
        # Only capitalize the first character; leave the rest untouched.
        sentence = body[:1].capitalize() + body[1:] + break_run
        if sentence:  # the body after the last break is empty
            sentences.append(sentence)
    return " ".join(sentences)


def sanitize_texts(texts: list[str], paragraph: bool = True) -> list[str]:
    """Sanitize each text prompt in the list (see sanitize_text).

    Args:
        texts: List of input text prompts.
        paragraph: Forwarded unchanged to sanitize_text for every prompt;
            defaults to True, which matches calling sanitize_text with its
            own default.

    Returns:
        List of sanitized texts, in the same order as the input.
    """
    return [sanitize_text(text, paragraph) for text in texts]


if __name__ == "__main__":
    # Manual demo: print a few sample prompts before and after sanitization.
    raw_prompts = [
        " A person is    walking.",
        "someone go forward",
        "jump",
        "jumping!",
        "jumping)",
        "-go",
        "blocasdji  -----",
        "",
    ]

    print("Old texts")
    print(*raw_prompts, sep="\n")
    print()

    cleaned_prompts = sanitize_texts(raw_prompts)
    print("Sanitized texts")
    print(*cleaned_prompts, sep="\n")