diff --git a/GPTQ-for-LLaMa/autograd_4bit.py b/GPTQ-for-LLaMa/autograd_4bit.py
index c55b7d3..c5c5489 100644
--- a/GPTQ-for-LLaMa/autograd_4bit.py
+++ b/GPTQ-for-LLaMa/autograd_4bit.py
@@ -15,6 +15,8 @@ auto_switch_thd = 16
 def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
     if shape_of_qweight not in buffer_mat_dic.keys():
         buffer_mat_dic[shape_of_qweight] = torch.zeros((shape_of_qweight[0] * 8, shape_of_qweight[1]), dtype=dtype, device=device)
+    elif buffer_mat_dic[shape_of_qweight].device != device:
+        buffer_mat_dic[shape_of_qweight] = buffer_mat_dic[shape_of_qweight].to(device)
     return buffer_mat_dic[shape_of_qweight]
 
 
@@ -217,8 +219,13 @@ def load_llama_model_4bit_low_ram(config_path, model_path, half=False):
         if name in layers:
             del layers[name]
     make_quant_for_4bit_autograd(model, layers)
-    model = accelerate.load_checkpoint_and_dispatch(model=model, checkpoint=model_path, device_map='auto')
-    model.cuda()
+    model = accelerate.load_checkpoint_and_dispatch(
+        model=model,
+        checkpoint=model_path,
+        device_map='auto',
+        no_split_module_classes=["LlamaDecoderLayer"]
+    )
+
     model.seqlen = 2048
 
     if half:
diff --git a/finetune.py b/finetune.py
index 6224f2e..a259372 100644
--- a/finetune.py
+++ b/finetune.py
@@ -92,6 +92,11 @@ if not ft_config.skip:
         from gradient_checkpointing import apply_gradient_checkpointing
         apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)
 
+    # Disable Trainer's DataParallel for multigpu
+    if torch.cuda.device_count() > 1:
+        model.is_parallelizable = True
+        model.model_parallel = True
+
     trainer = transformers.Trainer(
         model=model,
         train_dataset=data.train_data,