ltmia-code-mia-classifier / manifest_train.yaml
davidi123's picture
Upload manifest_train.yaml with huggingface_hub
e799299 verified
combinations:
cerebras_cerebras-gpt-2.7b_tokyotech_llm_swallow_code:
model_name: cerebras/Cerebras-GPT-2.7B
dataset_name: tokyotech-llm/swallow-code
path: features/train/cerebras_cerebras-gpt-2.7b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
cerebras_cerebras-gpt-2.7b_codeparrot_codeparrot_clean:
model_name: cerebras/Cerebras-GPT-2.7B
dataset_name: codeparrot/codeparrot-clean
path: features/train/cerebras_cerebras-gpt-2.7b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
cerebras_cerebras-gpt-2.7b_jetbrains_kstack:
model_name: cerebras/Cerebras-GPT-2.7B
dataset_name: JetBrains/KStack
path: features/train/cerebras_cerebras-gpt-2.7b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
cerebras_cerebras-gpt-2.7b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: cerebras/Cerebras-GPT-2.7B
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/cerebras_cerebras-gpt-2.7b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
cerebras_cerebras-gpt-2.7b_nan_do_code_search_net_java:
model_name: cerebras/Cerebras-GPT-2.7B
dataset_name: Nan-Do/code-search-net-java
path: features/train/cerebras_cerebras-gpt-2.7b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
cerebras_cerebras-gpt-2.7b_bigcode_the_stack_smol_xl:
model_name: cerebras/Cerebras-GPT-2.7B
dataset_name: bigcode/the-stack-smol-xl
path: features/train/cerebras_cerebras-gpt-2.7b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3200
val_size: 400
test_size: 400
eleutherai_pythia-1.4b_tokyotech_llm_swallow_code:
model_name: EleutherAI/pythia-1.4b
dataset_name: tokyotech-llm/swallow-code
path: features/train/eleutherai_pythia-1.4b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
eleutherai_pythia-1.4b_codeparrot_codeparrot_clean:
model_name: EleutherAI/pythia-1.4b
dataset_name: codeparrot/codeparrot-clean
path: features/train/eleutherai_pythia-1.4b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
eleutherai_pythia-1.4b_jetbrains_kstack:
model_name: EleutherAI/pythia-1.4b
dataset_name: JetBrains/KStack
path: features/train/eleutherai_pythia-1.4b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
eleutherai_pythia-1.4b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: EleutherAI/pythia-1.4b
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/eleutherai_pythia-1.4b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
eleutherai_pythia-1.4b_nan_do_code_search_net_java:
model_name: EleutherAI/pythia-1.4b
dataset_name: Nan-Do/code-search-net-java
path: features/train/eleutherai_pythia-1.4b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
eleutherai_pythia-1.4b_bigcode_the_stack_smol_xl:
model_name: EleutherAI/pythia-1.4b
dataset_name: bigcode/the-stack-smol-xl
path: features/train/eleutherai_pythia-1.4b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3600
val_size: 200
test_size: 200
eleutherai_gpt-j-6b_tokyotech_llm_swallow_code:
model_name: EleutherAI/gpt-j-6b
dataset_name: tokyotech-llm/swallow-code
path: features/train/eleutherai_gpt-j-6b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
eleutherai_gpt-j-6b_codeparrot_codeparrot_clean:
model_name: EleutherAI/gpt-j-6b
dataset_name: codeparrot/codeparrot-clean
path: features/train/eleutherai_gpt-j-6b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
eleutherai_gpt-j-6b_jetbrains_kstack:
model_name: EleutherAI/gpt-j-6b
dataset_name: JetBrains/KStack
path: features/train/eleutherai_gpt-j-6b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
eleutherai_gpt-j-6b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: EleutherAI/gpt-j-6b
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/eleutherai_gpt-j-6b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
eleutherai_gpt-j-6b_nan_do_code_search_net_java:
model_name: EleutherAI/gpt-j-6b
dataset_name: Nan-Do/code-search-net-java
path: features/train/eleutherai_gpt-j-6b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
eleutherai_gpt-j-6b_bigcode_the_stack_smol_xl:
model_name: EleutherAI/gpt-j-6b
dataset_name: bigcode/the-stack-smol-xl
path: features/train/eleutherai_gpt-j-6b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3600
val_size: 200
test_size: 200
google_gemma-2b_tokyotech_llm_swallow_code:
model_name: google/gemma-2b
dataset_name: tokyotech-llm/swallow-code
path: features/train/google_gemma-2b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
google_gemma-2b_codeparrot_codeparrot_clean:
model_name: google/gemma-2b
dataset_name: codeparrot/codeparrot-clean
path: features/train/google_gemma-2b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
google_gemma-2b_jetbrains_kstack:
model_name: google/gemma-2b
dataset_name: JetBrains/KStack
path: features/train/google_gemma-2b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
google_gemma-2b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: google/gemma-2b
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/google_gemma-2b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
google_gemma-2b_nan_do_code_search_net_java:
model_name: google/gemma-2b
dataset_name: Nan-Do/code-search-net-java
path: features/train/google_gemma-2b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
google_gemma-2b_bigcode_the_stack_smol_xl:
model_name: google/gemma-2b
dataset_name: bigcode/the-stack-smol-xl
path: features/train/google_gemma-2b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3600
val_size: 200
test_size: 200
qwen_qwen2-1.5b_tokyotech_llm_swallow_code:
model_name: Qwen/Qwen2-1.5B
dataset_name: tokyotech-llm/swallow-code
path: features/train/qwen_qwen2-1.5b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
qwen_qwen2-1.5b_codeparrot_codeparrot_clean:
model_name: Qwen/Qwen2-1.5B
dataset_name: codeparrot/codeparrot-clean
path: features/train/qwen_qwen2-1.5b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
qwen_qwen2-1.5b_jetbrains_kstack:
model_name: Qwen/Qwen2-1.5B
dataset_name: JetBrains/KStack
path: features/train/qwen_qwen2-1.5b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
qwen_qwen2-1.5b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: Qwen/Qwen2-1.5B
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/qwen_qwen2-1.5b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
qwen_qwen2-1.5b_nan_do_code_search_net_java:
model_name: Qwen/Qwen2-1.5B
dataset_name: Nan-Do/code-search-net-java
path: features/train/qwen_qwen2-1.5b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
qwen_qwen2-1.5b_bigcode_the_stack_smol_xl:
model_name: Qwen/Qwen2-1.5B
dataset_name: bigcode/the-stack-smol-xl
path: features/train/qwen_qwen2-1.5b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3600
val_size: 200
test_size: 200
tiiuae_falcon-rw-1b_tokyotech_llm_swallow_code:
model_name: tiiuae/falcon-rw-1b
dataset_name: tokyotech-llm/swallow-code
path: features/train/tiiuae_falcon-rw-1b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-rw-1b_codeparrot_codeparrot_clean:
model_name: tiiuae/falcon-rw-1b
dataset_name: codeparrot/codeparrot-clean
path: features/train/tiiuae_falcon-rw-1b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-rw-1b_jetbrains_kstack:
model_name: tiiuae/falcon-rw-1b
dataset_name: JetBrains/KStack
path: features/train/tiiuae_falcon-rw-1b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-rw-1b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: tiiuae/falcon-rw-1b
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/tiiuae_falcon-rw-1b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-rw-1b_nan_do_code_search_net_java:
model_name: tiiuae/falcon-rw-1b
dataset_name: Nan-Do/code-search-net-java
path: features/train/tiiuae_falcon-rw-1b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-rw-1b_bigcode_the_stack_smol_xl:
model_name: tiiuae/falcon-rw-1b
dataset_name: bigcode/the-stack-smol-xl
path: features/train/tiiuae_falcon-rw-1b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3600
val_size: 200
test_size: 200
distilgpt2_bigcode_the_stack_smol_xl:
model_name: distilgpt2
dataset_name: bigcode/the-stack-smol-xl
path: features/train/distilgpt2_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3200
val_size: 400
test_size: 400
distilgpt2_codeparrot_codeparrot_clean:
model_name: distilgpt2
dataset_name: codeparrot/codeparrot-clean
path: features/train/distilgpt2_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
distilgpt2_jetbrains_kstack:
model_name: distilgpt2
dataset_name: JetBrains/KStack
path: features/train/distilgpt2_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
distilgpt2_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: distilgpt2
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/distilgpt2_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
distilgpt2_nan_do_code_search_net_java:
model_name: distilgpt2
dataset_name: Nan-Do/code-search-net-java
path: features/train/distilgpt2_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
distilgpt2_tokyotech_llm_swallow_code:
model_name: distilgpt2
dataset_name: tokyotech-llm/swallow-code
path: features/train/distilgpt2_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
gpt2-xl_bigcode_the_stack_smol_xl:
model_name: gpt2-xl
dataset_name: bigcode/the-stack-smol-xl
path: features/train/gpt2-xl_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3200
val_size: 400
test_size: 400
gpt2-xl_codeparrot_codeparrot_clean:
model_name: gpt2-xl
dataset_name: codeparrot/codeparrot-clean
path: features/train/gpt2-xl_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
gpt2-xl_jetbrains_kstack:
model_name: gpt2-xl
dataset_name: JetBrains/KStack
path: features/train/gpt2-xl_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
gpt2-xl_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: gpt2-xl
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/gpt2-xl_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
gpt2-xl_nan_do_code_search_net_java:
model_name: gpt2-xl
dataset_name: Nan-Do/code-search-net-java
path: features/train/gpt2-xl_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
gpt2-xl_tokyotech_llm_swallow_code:
model_name: gpt2-xl
dataset_name: tokyotech-llm/swallow-code
path: features/train/gpt2-xl_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 8000
val_size: 1000
test_size: 1000
tiiuae_falcon-7b_tokyotech_llm_swallow_code:
model_name: tiiuae/falcon-7b
dataset_name: tokyotech-llm/swallow-code
path: features/train/tiiuae_falcon-7b_tokyotech_llm_swallow_code
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-7b_codeparrot_codeparrot_clean:
model_name: tiiuae/falcon-7b
dataset_name: codeparrot/codeparrot-clean
path: features/train/tiiuae_falcon-7b_codeparrot_codeparrot_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-7b_jetbrains_kstack:
model_name: tiiuae/falcon-7b
dataset_name: JetBrains/KStack
path: features/train/tiiuae_falcon-7b_jetbrains_kstack
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-7b_mvasiliniuc_iva_kotlin_codeint_clean:
model_name: tiiuae/falcon-7b
dataset_name: mvasiliniuc/iva-kotlin-codeint-clean
path: features/train/tiiuae_falcon-7b_mvasiliniuc_iva_kotlin_codeint_clean
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-7b_nan_do_code_search_net_java:
model_name: tiiuae/falcon-7b
dataset_name: Nan-Do/code-search-net-java
path: features/train/tiiuae_falcon-7b_nan_do_code_search_net_java
n_members: 10000
n_nonmembers: 10000
feature_dim: 154
sequence_length: 128
train_size: 9000
val_size: 500
test_size: 500
tiiuae_falcon-7b_bigcode_the_stack_smol_xl:
model_name: tiiuae/falcon-7b
dataset_name: bigcode/the-stack-smol-xl
path: features/train/tiiuae_falcon-7b_bigcode_the_stack_smol_xl
n_members: 4000
n_nonmembers: 4000
feature_dim: 154
sequence_length: 128
train_size: 3600
val_size: 200
test_size: 200