Spaces:
Runtime error
Runtime error
Commit
·
52aac07
1
Parent(s):
1d1af69
mention our friends
Browse files
- app.py +1 -3
- mem_calc.py +1 -1
- models.py +48 -25
app.py
CHANGED
|
@@ -32,10 +32,8 @@ with st.expander("More options"):
|
|
| 32 |
precisions_values = ('O0', 'O1', 'O3')
|
| 33 |
precision = st.selectbox('Precision', precisions_names, index=1)
|
| 34 |
|
| 35 |
-
vocab_size = int(st.number_input('Vocabulary size', min_value=1, step=1, value=50257, format="%i"))
|
| 36 |
-
|
| 37 |
args = mem_calc.parse_args(f"""
|
| 38 |
-
--model {model} --
|
| 39 |
{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''} {'--albert' if share_params else ''}
|
| 40 |
--fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size} --seqlen {seq_len}
|
| 41 |
""".split())
|
|
|
|
| 32 |
precisions_values = ('O0', 'O1', 'O3')
|
| 33 |
precision = st.selectbox('Precision', precisions_names, index=1)
|
| 34 |
|
|
|
|
|
|
|
| 35 |
args = mem_calc.parse_args(f"""
|
| 36 |
+
--model {model} --optimizer {optimizers_values[optimizers_names.index(optimizer)]}
|
| 37 |
{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''} {'--albert' if share_params else ''}
|
| 38 |
--fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size} --seqlen {seq_len}
|
| 39 |
""".split())
|
mem_calc.py
CHANGED
|
@@ -123,7 +123,7 @@ def parse_args(args=None):
|
|
| 123 |
help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
|
| 124 |
parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
|
| 125 |
parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
|
| 126 |
-
parser.add_argument('--vocab_size', type=int, default=
|
| 127 |
parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
|
| 128 |
parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
|
| 129 |
parser.add_argument('--zero', type=int, default=0,
|
|
|
|
| 123 |
help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
|
| 124 |
parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
|
| 125 |
parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
|
| 126 |
+
parser.add_argument('--vocab_size', type=int, default=None, help='The vocabulary to use.')
|
| 127 |
parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
|
| 128 |
parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
|
| 129 |
parser.add_argument('--zero', type=int, default=0,
|
models.py
CHANGED
|
@@ -1,97 +1,120 @@
|
|
| 1 |
models = {}
|
| 2 |
-
models['bert-
|
| 3 |
-
models['bert-
|
| 4 |
-
models['bert-
|
| 5 |
-
models['bert-
|
| 6 |
-
models['bert-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
models['bert-
|
| 11 |
-
models['bert-
|
| 12 |
-
models['bert-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
models['t5-3b'] = {}
|
| 15 |
models['t5-3b']['seqlen'] = 512
|
| 16 |
models['t5-3b']['dmodel'] = 1024
|
| 17 |
-
models['t5-3b']['
|
| 18 |
models['t5-3b']['nlayers'] = 48
|
|
|
|
| 19 |
|
| 20 |
models['t5-11b'] = {}
|
| 21 |
models['t5-11b']['seqlen'] = 512
|
| 22 |
models['t5-11b']['dmodel'] = 1024
|
| 23 |
-
models['t5-11b']['
|
| 24 |
models['t5-11b']['nlayers'] = 48
|
|
|
|
| 25 |
|
| 26 |
models['gpt2-s'] = {}
|
| 27 |
models['gpt2-s']['seqlen'] = 1024
|
| 28 |
models['gpt2-s']['dmodel'] = 768
|
| 29 |
-
models['gpt2-s']['
|
| 30 |
models['gpt2-s']['nlayers'] = 12
|
|
|
|
| 31 |
|
| 32 |
models['gpt2-m'] = {}
|
| 33 |
models['gpt2-m']['seqlen'] = 1024
|
| 34 |
models['gpt2-m']['dmodel'] = 1024
|
| 35 |
-
models['gpt2-m']['
|
| 36 |
models['gpt2-m']['nlayers'] = 24
|
|
|
|
| 37 |
|
| 38 |
models['gpt2-l'] = {}
|
| 39 |
models['gpt2-l']['seqlen'] = 1024
|
| 40 |
models['gpt2-l']['dmodel'] = 1280
|
| 41 |
-
models['gpt2-l']['
|
| 42 |
models['gpt2-l']['nlayers'] = 36
|
|
|
|
| 43 |
|
| 44 |
models['gpt2-xl'] = {}
|
| 45 |
models['gpt2-xl']['seqlen'] = 1024
|
| 46 |
models['gpt2-xl']['dmodel'] = 1600
|
| 47 |
-
models['gpt2-xl']['
|
| 48 |
models['gpt2-xl']['nlayers'] = 48
|
|
|
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
models['gpt3-s'] = {}
|
| 52 |
models['gpt3-s']['seqlen'] = 2048
|
| 53 |
models['gpt3-s']['dmodel'] = 768
|
| 54 |
-
models['gpt3-s']['
|
| 55 |
models['gpt3-s']['nlayers'] = 12
|
|
|
|
| 56 |
|
| 57 |
models['gpt3-m'] = {}
|
| 58 |
models['gpt3-m']['seqlen'] = 2048
|
| 59 |
models['gpt3-m']['dmodel'] = 1024
|
| 60 |
-
models['gpt3-m']['
|
| 61 |
models['gpt3-m']['nlayers'] = 24
|
|
|
|
| 62 |
|
| 63 |
models['gpt3-l'] = {}
|
| 64 |
models['gpt3-l']['seqlen'] = 2048
|
| 65 |
models['gpt3-l']['dmodel'] = 1536
|
| 66 |
-
models['gpt3-l']['
|
| 67 |
models['gpt3-l']['nlayers'] = 24
|
|
|
|
| 68 |
|
| 69 |
models['gpt3-xl'] = {}
|
| 70 |
models['gpt3-xl']['seqlen'] = 2048
|
| 71 |
models['gpt3-xl']['dmodel'] = 2560
|
| 72 |
-
models['gpt3-xl']['
|
| 73 |
models['gpt3-xl']['nlayers'] = 24
|
|
|
|
| 74 |
|
| 75 |
models['gpt3-3b'] = {}
|
| 76 |
models['gpt3-3b']['seqlen'] = 2048
|
| 77 |
models['gpt3-3b']['dmodel'] = 2560
|
| 78 |
-
models['gpt3-3b']['
|
| 79 |
models['gpt3-3b']['nlayers'] = 32
|
|
|
|
| 80 |
|
| 81 |
models['gpt3-7b'] = {}
|
| 82 |
models['gpt3-7b']['seqlen'] = 2048
|
| 83 |
models['gpt3-7b']['dmodel'] = 4096
|
| 84 |
-
models['gpt3-7b']['
|
| 85 |
models['gpt3-7b']['nlayers'] = 32
|
|
|
|
| 86 |
|
| 87 |
models['gpt3-13b'] = {}
|
| 88 |
models['gpt3-13b']['seqlen'] = 2048
|
| 89 |
models['gpt3-13b']['dmodel'] = 5120
|
| 90 |
-
models['gpt3-13b']['
|
| 91 |
models['gpt3-13b']['nlayers'] = 40
|
|
|
|
| 92 |
|
| 93 |
models['gpt3-175b'] = {}
|
| 94 |
models['gpt3-175b']['seqlen'] = 2048
|
| 95 |
models['gpt3-175b']['dmodel'] = 12288
|
| 96 |
-
models['gpt3-175b']['
|
| 97 |
models['gpt3-175b']['nlayers'] = 96
|
|
|
|
|
|
| 1 |
models = {}
|
| 2 |
+
models['bert-base'] = {}
|
| 3 |
+
models['bert-base']['seqlen'] = 512
|
| 4 |
+
models['bert-base']['dmodel'] = 768
|
| 5 |
+
models['bert-base']['dhid'] = 3072
|
| 6 |
+
models['bert-base']['nlayers'] = 12
|
| 7 |
+
models['bert-base']['vocab_size'] = 30522
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
models['bert-large'] = {}
|
| 11 |
+
models['bert-large']['seqlen'] = 512
|
| 12 |
+
models['bert-large']['dmodel'] = 1024
|
| 13 |
+
models['bert-large']['dhid'] = 4096
|
| 14 |
+
models['bert-large']['nlayers'] = 24
|
| 15 |
+
models['bert-large']['vocab_size'] = 30522
|
| 16 |
|
| 17 |
models['t5-3b'] = {}
|
| 18 |
models['t5-3b']['seqlen'] = 512
|
| 19 |
models['t5-3b']['dmodel'] = 1024
|
| 20 |
+
models['t5-3b']['dhid'] = 16384
|
| 21 |
models['t5-3b']['nlayers'] = 48
|
| 22 |
+
models['t5-3b']['vocab_size'] = 32128
|
| 23 |
|
| 24 |
models['t5-11b'] = {}
|
| 25 |
models['t5-11b']['seqlen'] = 512
|
| 26 |
models['t5-11b']['dmodel'] = 1024
|
| 27 |
+
models['t5-11b']['dhid'] = 64*1024
|
| 28 |
models['t5-11b']['nlayers'] = 48
|
| 29 |
+
models['t5-11b']['vocab_size'] = 32128
|
| 30 |
|
| 31 |
models['gpt2-s'] = {}
|
| 32 |
models['gpt2-s']['seqlen'] = 1024
|
| 33 |
models['gpt2-s']['dmodel'] = 768
|
| 34 |
+
models['gpt2-s']['dhid'] = 768*4
|
| 35 |
models['gpt2-s']['nlayers'] = 12
|
| 36 |
+
models['gpt2-s']['vocab_size'] = 50257
|
| 37 |
|
| 38 |
models['gpt2-m'] = {}
|
| 39 |
models['gpt2-m']['seqlen'] = 1024
|
| 40 |
models['gpt2-m']['dmodel'] = 1024
|
| 41 |
+
models['gpt2-m']['dhid'] = 1024*4
|
| 42 |
models['gpt2-m']['nlayers'] = 24
|
| 43 |
+
models['gpt2-m']['vocab_size'] = 50257
|
| 44 |
|
| 45 |
models['gpt2-l'] = {}
|
| 46 |
models['gpt2-l']['seqlen'] = 1024
|
| 47 |
models['gpt2-l']['dmodel'] = 1280
|
| 48 |
+
models['gpt2-l']['dhid'] = 1280*4
|
| 49 |
models['gpt2-l']['nlayers'] = 36
|
| 50 |
+
models['gpt2-l']['vocab_size'] = 50257
|
| 51 |
|
| 52 |
models['gpt2-xl'] = {}
|
| 53 |
models['gpt2-xl']['seqlen'] = 1024
|
| 54 |
models['gpt2-xl']['dmodel'] = 1600
|
| 55 |
+
models['gpt2-xl']['dhid'] = 1600*4
|
| 56 |
models['gpt2-xl']['nlayers'] = 48
|
| 57 |
+
models['gpt2-xl']['vocab_size'] = 50257
|
| 58 |
|
| 59 |
+
models['gpt-j-6b'] = {}
|
| 60 |
+
models['gpt-j-6b']['seqlen'] = 2048
|
| 61 |
+
models['gpt-j-6b']['dmodel'] = 4096
|
| 62 |
+
models['gpt-j-6b']['dhid'] = 4096 * 4
|
| 63 |
+
models['gpt-j-6b']['nlayers'] = 28
|
| 64 |
+
models['gpt-j-6b']['vocab_size'] = 50400
|
| 65 |
|
| 66 |
models['gpt3-s'] = {}
|
| 67 |
models['gpt3-s']['seqlen'] = 2048
|
| 68 |
models['gpt3-s']['dmodel'] = 768
|
| 69 |
+
models['gpt3-s']['dhid'] = 768*4
|
| 70 |
models['gpt3-s']['nlayers'] = 12
|
| 71 |
+
models['gpt3-s']['vocab_size'] = 50257 # from public reimplementations
|
| 72 |
|
| 73 |
models['gpt3-m'] = {}
|
| 74 |
models['gpt3-m']['seqlen'] = 2048
|
| 75 |
models['gpt3-m']['dmodel'] = 1024
|
| 76 |
+
models['gpt3-m']['dhid'] = 1024*4
|
| 77 |
models['gpt3-m']['nlayers'] = 24
|
| 78 |
+
models['gpt3-m']['vocab_size'] = 50257 # from public reimplementations
|
| 79 |
|
| 80 |
models['gpt3-l'] = {}
|
| 81 |
models['gpt3-l']['seqlen'] = 2048
|
| 82 |
models['gpt3-l']['dmodel'] = 1536
|
| 83 |
+
models['gpt3-l']['dhid'] = 1536*4
|
| 84 |
models['gpt3-l']['nlayers'] = 24
|
| 85 |
+
models['gpt3-l']['vocab_size'] = 50257 # from public reimplementations
|
| 86 |
|
| 87 |
models['gpt3-xl'] = {}
|
| 88 |
models['gpt3-xl']['seqlen'] = 2048
|
| 89 |
models['gpt3-xl']['dmodel'] = 2560
|
| 90 |
+
models['gpt3-xl']['dhid'] = 2560*4
|
| 91 |
models['gpt3-xl']['nlayers'] = 24
|
| 92 |
+
models['gpt3-xl']['vocab_size'] = 50257 # from public reimplementations
|
| 93 |
|
| 94 |
models['gpt3-3b'] = {}
|
| 95 |
models['gpt3-3b']['seqlen'] = 2048
|
| 96 |
models['gpt3-3b']['dmodel'] = 2560
|
| 97 |
+
models['gpt3-3b']['dhid'] = 2560*4
|
| 98 |
models['gpt3-3b']['nlayers'] = 32
|
| 99 |
+
models['gpt3-3b']['vocab_size'] = 50257 # from public reimplementations
|
| 100 |
|
| 101 |
models['gpt3-7b'] = {}
|
| 102 |
models['gpt3-7b']['seqlen'] = 2048
|
| 103 |
models['gpt3-7b']['dmodel'] = 4096
|
| 104 |
+
models['gpt3-7b']['dhid'] = 4096*4
|
| 105 |
models['gpt3-7b']['nlayers'] = 32
|
| 106 |
+
models['gpt3-7b']['vocab_size'] = 50257 # from public reimplementations
|
| 107 |
|
| 108 |
models['gpt3-13b'] = {}
|
| 109 |
models['gpt3-13b']['seqlen'] = 2048
|
| 110 |
models['gpt3-13b']['dmodel'] = 5120
|
| 111 |
+
models['gpt3-13b']['dhid'] = 5120*4
|
| 112 |
models['gpt3-13b']['nlayers'] = 40
|
| 113 |
+
models['gpt3-13b']['vocab_size'] = 50257 # from public reimplementations
|
| 114 |
|
| 115 |
models['gpt3-175b'] = {}
|
| 116 |
models['gpt3-175b']['seqlen'] = 2048
|
| 117 |
models['gpt3-175b']['dmodel'] = 12288
|
| 118 |
+
models['gpt3-175b']['dhid'] = 12288*4
|
| 119 |
models['gpt3-175b']['nlayers'] = 96
|
| 120 |
+
models['gpt3-175b']['vocab_size'] = 50257 # from public reimplementations
|