| """ |
| Functions to convert Arabic words/text into Buckwalter encoding and vice versa. |
| """ |
|
|
| import sys |
| import re |
| import utils |
|
|
# Buckwalter (ASCII) symbol -> Arabic Unicode character.
buck2uni = {
    # Hamza and hamza-carrying forms (U+0621 - U+0626)
    "'": u"\u0621",
    "|": u"\u0622",
    ">": u"\u0623",
    "&": u"\u0624",
    "<": u"\u0625",
    "}": u"\u0626",
    # Letters alef .. ghain (U+0627 - U+063A)
    "A": u"\u0627",
    "b": u"\u0628",
    "p": u"\u0629",
    "t": u"\u062A",
    "v": u"\u062B",
    "j": u"\u062C",
    "H": u"\u062D",
    "x": u"\u062E",
    "d": u"\u062F",
    "*": u"\u0630",
    "r": u"\u0631",
    "z": u"\u0632",
    "s": u"\u0633",
    "$": u"\u0634",
    "S": u"\u0635",
    "D": u"\u0636",
    "T": u"\u0637",
    "Z": u"\u0638",
    "E": u"\u0639",
    "g": u"\u063A",
    # Tatweel / kashida (U+0640)
    "_": u"\u0640",
    # Letters feh .. yeh (U+0641 - U+064A)
    "f": u"\u0641",
    "q": u"\u0642",
    "k": u"\u0643",
    "l": u"\u0644",
    "m": u"\u0645",
    "n": u"\u0646",
    "h": u"\u0647",
    "w": u"\u0648",
    "Y": u"\u0649",
    "y": u"\u064A",
    # Diacritics: tanween, short vowels, shadda, sukun (U+064B - U+0652)
    "F": u"\u064B",
    "N": u"\u064C",
    "K": u"\u064D",
    "a": u"\u064E",
    "u": u"\u064F",
    "i": u"\u0650",
    "~": u"\u0651",
    "o": u"\u0652",
    # Superscript (dagger) alef and alef wasla
    "`": u"\u0670",
    "{": u"\u0671",
}

# Reverse table: Arabic Unicode character -> Buckwalter symbol.
uni2buck = {arabic: latin for latin, arabic in buck2uni.items()}

# Lam-alef ligatures (isolated presentation forms) are single code points
# that have no one-symbol Buckwalter equivalent; each expands to two symbols.
uni2buck.update({
    u"\ufefb": "lA",  # lam + alef
    u"\ufef7": "l>",  # lam + alef with hamza above
    u"\ufef5": "l|",  # lam + alef with madda above
    u"\ufef9": "l<",  # lam + alef with hamza below
})
|
|
| |
def clean_text(text):
    """Prepare Arabic text for Arabic -> Buckwalter transliteration.

    Removes every U+FEFF (byte-order mark / zero-width no-break space)
    character, then passes the text through ``utils.remove_non_arabic``,
    ``utils.strip_tashkeel`` and ``utils.strip_tatweel``, in that order.

    NOTE(review): the three ``utils`` helpers live in a project module not
    shown here; going by their names they drop non-Arabic characters,
    diacritics and tatweel respectively -- confirm against ``utils``.

    Args:
        text: The text to clean.

    Returns:
        The value returned by the last helper, ``utils.strip_tatweel``.
    """
    without_bom = re.sub(u"[\ufeff]", "", text, flags=re.UNICODE)
    arabic_only = utils.remove_non_arabic(without_bom)
    return utils.strip_tatweel(utils.strip_tashkeel(arabic_only))
|
|
| |
def transliterate_word(input_word, direction='bw2ar'):
    """Transliterate a single word, one character at a time.

    Each character is looked up in the table for the chosen direction
    (``buck2uni`` for ``'bw2ar'``, ``uni2buck`` for ``'ar2bw'``). Characters
    without an entry are copied to the output unchanged.

    Args:
        input_word: The word to convert (any iterable of characters).
        direction: ``'bw2ar'`` (Buckwalter -> Arabic, the default) or
            ``'ar2bw'`` (Arabic -> Buckwalter).

    Returns:
        The transliterated word as a string. In the ``'ar2bw'`` direction a
        lam-alef ligature character expands to two output symbols.

    Raises:
        SystemExit: With exit status 1, after writing a message to stderr,
            if ``direction`` is not one of the two supported values. The
            direction is checked before any character is processed, so an
            invalid direction is reported even for an empty word.
    """
    # Pick the table once up front rather than re-testing the direction for
    # every character.
    if direction == 'bw2ar':
        mapping = buck2uni
    elif direction == 'ar2bw':
        mapping = uni2buck
    else:
        sys.stderr.write(
            "Error: invalid direction '{}'! "
            "Expected 'bw2ar' or 'ar2bw'.\n".format(direction)
        )
        # Still SystemExit for existing callers, but with a non-zero status
        # so the failure is not reported to the shell as success.
        sys.exit(1)

    return ''.join(mapping.get(char, char) for char in input_word)
|
|
|
|
| |
def transliterate_text(input_text, direction='bw2ar'):
    """Transliterate a text word by word.

    The text is split on single space characters only, so runs of spaces
    produce empty words and are preserved in the output. Every piece is
    converted with ``transliterate_word`` and the results are re-joined
    with single spaces.

    Args:
        input_text: The text to convert.
        direction: Passed through to ``transliterate_word``
            (``'bw2ar'`` or ``'ar2bw'``).

    Returns:
        The transliterated text, with the same number of space-separated
        pieces as the input.
    """
    words = input_text.split(' ')
    return ' '.join(transliterate_word(word, direction) for word in words)
|
|
|
|
def main(argv=None):
    """Command-line entry point: transliterate stdin line by line.

    Reads lines from standard input, converts each one in the direction
    given as the first command-line argument and writes every non-empty
    result to standard output followed by a newline. For any direction
    other than ``'bw2ar'`` the line is first passed through ``clean_text``.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: With exit status 1, after printing a usage message to
            stderr, if the direction argument is missing or is not
            ``'bw2ar'`` / ``'ar2bw'``.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) < 2 or argv[1] not in ('bw2ar', 'ar2bw'):
        # The usage line names the script itself (argv[0]); indexing argv[1]
        # here would fail precisely when the argument is missing.
        program = argv[0] if argv else ''
        sys.stderr.write(
            'Usage: INPUT TEXT | python {} DIRECTION(bw2ar|ar2bw)\n'.format(program)
        )
        sys.exit(1)

    direction = argv[1]
    for line in sys.stdin:
        # Buckwalter input is taken as-is; Arabic input is cleaned first.
        if direction != 'bw2ar':
            line = clean_text(line)
        output_text = transliterate_text(line, direction=direction).strip()
        if output_text:
            sys.stdout.write('{}\n'.format(output_text))


if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|