xwinostorymt #827

Open · wants to merge 32 commits into base: eval-hackathon

Commits (32)
d1f16cf
Accelerate `get_infos` by caching the `DataseInfoDict`s (#778)
VictorSanh May 22, 2022
6b1560f
fix `filter_english_datasets` since `languages` became `language` in …
VictorSanh Jul 3, 2022
ab6ad7e
fix empty documents - multi_news (#793)
VictorSanh Jul 3, 2022
0cc4b0c
Language tags (#771)
stephenbach Jul 8, 2022
619daa1
Merge pull request #4 from bigscience-workshop/eval-hackathon
Muennighoff Jul 19, 2022
53c99f3
ZH prompts v1
Muennighoff Aug 9, 2022
7293640
Decided this wording is a bit better
Muennighoff Aug 9, 2022
97cc39b
Use bu
Muennighoff Aug 9, 2022
ccd2575
Make it fit better
Muennighoff Aug 10, 2022
426f945
Update IDs
Muennighoff Aug 10, 2022
f5ba708
Specify ZH-HT
Muennighoff Aug 11, 2022
4a00362
ZHMT
Muennighoff Aug 11, 2022
172740f
Add swht & hiht
Muennighoff Aug 12, 2022
4f973e5
Add arabic & vietnamese
Muennighoff Aug 16, 2022
1fdcbea
add arabic prompts
KhalidAlt Aug 16, 2022
64ac829
add arabic prompts and fix unicode issues
KhalidAlt Aug 16, 2022
148344d
fix semantic errors
KhalidAlt Aug 16, 2022
51c059d
Complete XNLI HT L1
Muennighoff Aug 21, 2022
8e374e3
Merge pull request #33 from KhalidAlt/add_xnli_ar
Muennighoff Aug 21, 2022
b00dda3
Add machine translation
Muennighoff Aug 21, 2022
e401d0d
Fix naming
Muennighoff Sep 3, 2022
48a1467
Update uuids
Muennighoff Sep 3, 2022
3f69285
Fix uuids
Muennighoff Sep 3, 2022
be40398
Fix uuids
Muennighoff Sep 3, 2022
7b7bc05
Fix uuid
Muennighoff Sep 3, 2022
8c0c243
Fix
Muennighoff Sep 3, 2022
ff3eada
XCOPA Machine translated
Muennighoff Sep 18, 2022
4c742ca
Clarify format_
Muennighoff Sep 18, 2022
93bcf9a
Fix template name
Muennighoff Sep 18, 2022
d09ebb9
MT
Muennighoff Sep 18, 2022
c1520bb
Add templates
Muennighoff Sep 26, 2022
0906d71
Add translations
Muennighoff Oct 25, 2022
176 changes: 176 additions & 0 deletions promptsource/machine_translate.py
@@ -0,0 +1,176 @@
import os
import re

from promptsource.templates import Template, TemplateCollection


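# NOTE: each "### <DATASET>" section below re-binds PROMPTS, LANGS, SOURCE_DATASET,
# SOURCE_LANG and TARGET_DATASET, so only the last section left active (XWINOGRAD,
# as committed) is in effect when the translation loop at the bottom of the file runs.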
### XNLI

PROMPTS = [
    "GPT-3 style",
    "can we infer",
    "justified in saying",
    "guaranteed/possible/impossible",
    "MNLI crowdsource",
]

LANGS = [
    "ar",
    "es",
    "fr",
    "hi",
    "sw",
    "ur",
    "vi",
    "zh",
]

SOURCE_DATASET = TARGET_DATASET = "xnli"
SOURCE_LANG = "en"


### XCOPA

PROMPTS = [
    "best_option",
    "C1 or C2? premise, so/because…",
    "i_am_hesitating",
    "cause_effect",
    "plausible_alternatives",
]

LANGS = [
    "id",
    "sw",
    "ta",
    "vi",
    "zh",
    "th",
    "it",
    "qu",
]

SOURCE_DATASET = "super_glue/copa"
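# SOURCE_LANG doubles as the subset argument handed to get_dataset at the bottom of the
# file; for COPA the subset ("copa") is already folded into SOURCE_DATASET, so no extra
# subset is needed.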
SOURCE_LANG = None
TARGET_DATASET = "xcopa"

### XSTORY_CLOZE

PROMPTS = [
    "Answer Given options",
    "Choose Story Ending",
    "Story Continuation and Options",
    "Generate Ending",
    "Novel Correct Ending",
]

LANGS = [
    "ar",
    "es",
    "eu",
    "hi",
    "id",
    "zh",
    "my",
    "ru",
    "sw",
    "te",
]

SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xstory_cloze"
SOURCE_LANG = "en"

### XWINOGRAD
PROMPTS = [
    "Replace",
    "stand for",
    "True or False",
    "does underscore refer to",
    "underscore refer to",
]

LANGS = [
    "fr",
    "pt",
    "zh",
    "ja",
    "ru",
]

SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xwinograd"
SOURCE_LANG = "en"


# Path to the Google Cloud service-account key used by the Translation API client
# (hardcoded to the author's machine; point it at your own key file to run this script).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/niklasmuennighoff/Desktop/gcp_translate_key.json"


def translate(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code; see
    https://g.co/cloud/translate/v2/translate-reference#supported_languages
    Requires:
        pip install --upgrade google-api-python-client
        pip install google-cloud-translate
    """
    import six
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    # Text can also be a sequence of strings, in which case this method
    # returns a sequence of results, one per input string.
    # By default format_ is "html", which would return &quot; instead of ";
    # "text" keeps the raw characters.
    result = translate_client.translate(text, source_language="en", target_language=target, format_="text")
    print("Text: {}".format(result["input"]))
    print("Translation: {}".format(result["translatedText"]))
    # If source_language is omitted, the API detects it:
    # print("Detected source language: {}".format(result["detectedSourceLanguage"]))
    return result["translatedText"]
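
# Example call (needs valid credentials; the exact output depends on the API):
#   translate("zh", "True, False, or Neither?")  # -> e.g. "对、错或两者都不是?"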


def normalize_string(zh_string, en_string):
    """Replaces the bracketed (Jinja) content in zh_string with the bracketed content
    from en_string, so translated templates keep the original variable names.
    Everything else in zh_string is left unchanged. The zh_/en_ names are only an
    example (originally to help Codex); the function is not specific to Chinese.

    Args:
        zh_string: {{前提}} 问题:{{假设}} 对、错或两者都不是? ||| {{ answer_choices[标签] }}
        en_string: {{premise}} Question: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}
    Returns:
        zh_string_normalized: {{premise}} 问题:{{hypothesis}} 对、错或两者都不是? ||| {{ answer_choices[label] }}
    """
    zh_string_normalized = zh_string
    # Find all bracketed content in zh_string and en_string. Matching single brackets
    # (rather than only double brackets, {{(.*?)}}) also catches single-braced Jinja tags.
    zh_bracket_content = re.findall(r"{(.*?)}", zh_string)
    en_bracket_content = re.findall(r"{(.*?)}", en_string)
    # Replace each bracketed span in zh_string with the corresponding span from en_string
    for i in range(len(zh_bracket_content)):
        zh_string_normalized = zh_string_normalized.replace(zh_bracket_content[i], en_bracket_content[i])
    return zh_string_normalized
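
# Worked example (the docstring case above):
#   zh = "{{前提}} 问题:{{假设}} 对、错或两者都不是? ||| {{ answer_choices[标签] }}"
#   en = "{{premise}} Question: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}"
#   normalize_string(zh, en)
#   -> "{{premise}} 问题:{{hypothesis}} 对、错或两者都不是? ||| {{ answer_choices[label] }}"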


template_collection = TemplateCollection()
source_templates = template_collection.get_dataset(SOURCE_DATASET, SOURCE_LANG)

for lang in LANGS:
    target_templates = template_collection.get_dataset(TARGET_DATASET, lang)
    for uid, template in source_templates.templates.items():
        if template.name.strip() not in PROMPTS:
            continue
        print(f"Translating {template.name.strip()} to {lang}")
        answer_choices = []
        if template.answer_choices is not None:
            choices = template.answer_choices.split("|||")
            for c in choices:
                answer_choices.append(normalize_string(translate(lang, c.strip()), c.strip()))
        or_jinja = template.jinja.strip()
        jinja = normalize_string(translate(lang, or_jinja), or_jinja)
        template_name = template.name.strip() + f"_{lang}mt"
        target_template = Template(
            template_name, jinja=jinja, reference="", answer_choices=" ||| ".join(answer_choices)
        )
        target_templates.add_template(target_template)
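        # add_template writes the updated collection back to its YAML file; see the
        # allow_unicode=True change to write_to_file in templates.py below, which keeps
        # the translated non-ASCII text unescaped in the dumped YAML.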
208 changes: 207 additions & 1 deletion promptsource/templates.py
@@ -235,6 +235,212 @@
    "zu": "Zulu",
}

# These are the metrics with which templates can be tagged
METRICS = {
"BLEU",
"ROUGE",
"Squad",
"Trivia QA",
"Accuracy",
"Pearson Correlation",
"Spearman Correlation",
"MultiRC",
"AUC",
"COQA F1",
"Edit Distance",
"Mean Reciprocal Rank",
"Other",
}

# These are the languages with which templates can be tagged. Keys are ISO 639-1
# tags, which are the actual tags we use. Values are English names shown in the
# UI for convenience.
LANGUAGES = {
"ab": "Abkhazian",
"aa": "Afar",
"af": "Afrikaans",
"ak": "Akan",
"sq": "Albanian",
"am": "Amharic",
"ar": "Arabic",
"an": "Aragonese",
"hy": "Armenian",
"as": "Assamese",
"av": "Avaric",
"ae": "Avestan",
"ay": "Aymara",
"az": "Azerbaijani",
"bm": "Bambara",
"ba": "Bashkir",
"eu": "Basque",
"be": "Belarusian",
"bn": "Bengali",
"bi": "Bislama",
"bs": "Bosnian",
"br": "Breton",
"bg": "Bulgarian",
"my": "Burmese",
"ca": "Catalan, Valencian",
"ch": "Chamorro",
"ce": "Chechen",
"ny": "Chichewa, Chewa, Nyanja",
"zh": "Chinese",
"cu": "Church Slavic, Old Slavonic, Church Slavonic, Old Bulgarian, Old Church Slavonic",
"cv": "Chuvash",
"kw": "Cornish",
"co": "Corsican",
"cr": "Cree",
"hr": "Croatian",
"cs": "Czech",
"da": "Danish",
"dv": "Divehi, Dhivehi, Maldivian",
"nl": "Dutch, Flemish",
"dz": "Dzongkha",
"en": "English",
"eo": "Esperanto",
"et": "Estonian",
"ee": "Ewe",
"fo": "Faroese",
"fj": "Fijian",
"fi": "Finnish",
"fr": "French",
"fy": "Western Frisian",
"ff": "Fulah",
"gd": "Gaelic, Scottish Gaelic",
"gl": "Galician",
"lg": "Ganda",
"ka": "Georgian",
"de": "German",
"el": "Greek, Modern (1453–)",
"kl": "Kalaallisut, Greenlandic",
"gn": "Guarani",
"gu": "Gujarati",
"ht": "Haitian, Haitian Creole",
"ha": "Hausa",
"he": "Hebrew",
"hz": "Herero",
"hi": "Hindi",
"ho": "Hiri Motu",
"hu": "Hungarian",
"is": "Icelandic",
"io": "Ido",
"ig": "Igbo",
"id": "Indonesian",
"ia": "Interlingua (International Auxiliary Language Association)",
"ie": "Interlingue, Occidental",
"iu": "Inuktitut",
"ik": "Inupiaq",
"ga": "Irish",
"it": "Italian",
"ja": "Japanese",
"jv": "Javanese",
"kn": "Kannada",
"kr": "Kanuri",
"ks": "Kashmiri",
"kk": "Kazakh",
"km": "Central Khmer",
"ki": "Kikuyu, Gikuyu",
"rw": "Kinyarwanda",
"ky": "Kirghiz, Kyrgyz",
"kv": "Komi",
"kg": "Kongo",
"ko": "Korean",
"kj": "Kuanyama, Kwanyama",
"ku": "Kurdish",
"lo": "Lao",
"la": "Latin",
"lv": "Latvian",
"li": "Limburgan, Limburger, Limburgish",
"ln": "Lingala",
"lt": "Lithuanian",
"lu": "Luba-Katanga",
"lb": "Luxembourgish, Letzeburgesch",
"mk": "Macedonian",
"mg": "Malagasy",
"ms": "Malay",
"ml": "Malayalam",
"mt": "Maltese",
"gv": "Manx",
"mi": "Maori",
"mr": "Marathi",
"mh": "Marshallese",
"mn": "Mongolian",
"na": "Nauru",
"nv": "Navajo, Navaho",
"nd": "North Ndebele",
"nr": "South Ndebele",
"ng": "Ndonga",
"ne": "Nepali",
"no": "Norwegian",
"nb": "Norwegian Bokmål",
"nn": "Norwegian Nynorsk",
"ii": "Sichuan Yi, Nuosu",
"oc": "Occitan",
"oj": "Ojibwa",
"or": "Oriya",
"om": "Oromo",
"os": "Ossetian, Ossetic",
"pi": "Pali",
"ps": "Pashto, Pushto",
"fa": "Persian",
"pl": "Polish",
"pt": "Portuguese",
"pa": "Punjabi, Panjabi",
"qu": "Quechua",
"ro": "Romanian, Moldavian, Moldovan",
"rm": "Romansh",
"rn": "Rundi",
"ru": "Russian",
"se": "Northern Sami",
"sm": "Samoan",
"sg": "Sango",
"sa": "Sanskrit",
"sc": "Sardinian",
"sr": "Serbian",
"sn": "Shona",
"sd": "Sindhi",
"si": "Sinhala, Sinhalese",
"sk": "Slovak",
"sl": "Slovenian",
"so": "Somali",
"st": "Southern Sotho",
"es": "Spanish, Castilian",
"su": "Sundanese",
"sw": "Swahili",
"ss": "Swati",
"sv": "Swedish",
"tl": "Tagalog",
"ty": "Tahitian",
"tg": "Tajik",
"ta": "Tamil",
"tt": "Tatar",
"te": "Telugu",
"th": "Thai",
"bo": "Tibetan",
"ti": "Tigrinya",
"to": "Tonga (Tonga Islands)",
"ts": "Tsonga",
"tn": "Tswana",
"tr": "Turkish",
"tk": "Turkmen",
"tw": "Twi",
"ug": "Uighur, Uyghur",
"uk": "Ukrainian",
"ur": "Urdu",
"uz": "Uzbek",
"ve": "Venda",
"vi": "Vietnamese",
"vo": "Volapük",
"wa": "Walloon",
"cy": "Welsh",
"wo": "Wolof",
"xh": "Xhosa",
"yi": "Yiddish",
"yo": "Yoruba",
"za": "Zhuang, Chuang",
"zu": "Zulu",
}


def highlight(input):
    return "<span style='color: #F08080'>" + input + "</span>"
@@ -617,7 +823,7 @@ def write_to_file(self) -> None:
        # We only create the folder if a template is written
        if not os.path.exists(self.folder_path):
            os.makedirs(self.folder_path)
-        yaml.dump(self.format_for_dump(), open(self.yaml_path, "w"))
+        yaml.dump(self.format_for_dump(), open(self.yaml_path, "w"), allow_unicode=True)

    def add_template(self, template: "Template") -> None:
        """