diff --git a/moshi/moshi/modules/transformer.py b/moshi/moshi/modules/transformer.py
index 553a2a8..b6a3622 100644
--- a/moshi/moshi/modules/transformer.py
+++ b/moshi/moshi/modules/transformer.py
@@ -265,8 +265,8 @@ def complete(self, k: torch.Tensor, v: torch.Tensor) -> KVCacheResult:
         B, H, T, D = k.shape
         indexes = torch.arange(T, device=self.end_offset.device, dtype=self.end_offset.dtype) + self.end_offset
         indexes = indexes % self.capacity
-        self.cache[0].index_copy_(2, indexes, k)
-        self.cache[1].index_copy_(2, indexes, v)
+        self.cache[0][:, :, indexes] = k
+        self.cache[1][:, :, indexes] = v
         self.end_offset.add_(T)
         keys = self.cache[0]
diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py
index 771f491..b7e4fdf 100644
--- a/moshi/moshi/server.py
+++ b/moshi/moshi/server.py
@@ -52,7 +52,7 @@
 logger = setup_logger(__name__)
 
-DeviceString = Literal["cuda"] | Literal["cpu"] #| Literal["mps"]
+DeviceString = Literal["cuda"] | Literal["cpu"] | Literal["mps"]
 
 
 def torch_auto_device(requested: Optional[DeviceString] = None) -> torch.device:
     """Return a torch.device based on the requested string or availability."""
@@ -60,8 +60,8 @@ def torch_auto_device(requested: Optional[DeviceString] = None) -> torch.device:
     if requested is not None:
         return torch.device(requested)
     if torch.cuda.is_available():
         return torch.device("cuda")
-    #elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-    # return torch.device("mps")
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        return torch.device("mps")
     return torch.device("cpu")
@@ -369,7 +369,7 @@ def main():
     parser.add_argument("--hf-repo", type=str, default=loaders.DEFAULT_REPO,
                         help="HF repo to look into, defaults PersonaPlex. "
                              "Use this to select a different pre-trained model.")
-    parser.add_argument("--device", type=str, default="cuda", help="Device on which to run, defaults to 'cuda'.")
+    parser.add_argument("--device", type=str, default=None, help="Device on which to run, auto-detected if not set.")
     parser.add_argument("--cpu-offload", action="store_true",
                         help="Offload LM model layers to CPU when GPU memory is insufficient. "
                              "Requires 'accelerate' package.")
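
Note on the transformer.py hunk: `index_copy_` along dim 2 and plain advanced-indexing assignment write the same cache slots whenever the wrapped indexes are unique, i.e. whenever `T <= capacity`, which is what the ring buffer assumes; the indexing form is also generally better supported on non-CUDA backends such as MPS. A minimal sketch of the equivalence (shapes and the offset value are illustrative, not taken from the repo):

```python
# Equivalence check between the old and new KV-cache write paths.
# B/H/capacity/D/T and end_offset are made-up values for illustration.
import torch

B, H, capacity, D, T = 1, 2, 8, 4, 3
cache_old = torch.zeros(B, H, capacity, D)
cache_new = torch.zeros(B, H, capacity, D)
k = torch.randn(B, H, T, D)

end_offset = torch.tensor(6, dtype=torch.long)       # pretend 6 steps already written
indexes = (torch.arange(T) + end_offset) % capacity  # wraps around: [6, 7, 0]

cache_old.index_copy_(2, indexes, k)  # old path
cache_new[:, :, indexes] = k          # new path

assert torch.equal(cache_old, cache_new)
```

When `indexes` contains duplicates (only possible if `T > capacity`) both forms have undefined write order, so the equivalence holds exactly in the regime the cache is designed for.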
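Note on `torch_auto_device`: with the MPS branch restored, resolution order is explicit request, then CUDA, then MPS, then CPU. A quick sanity check, assuming the import path implied by the diff:

```python
import torch
from moshi.server import torch_auto_device  # import path assumed from the diff

print(torch_auto_device())       # "mps" on Apple Silicon when CUDA is absent
print(torch_auto_device("cpu"))  # an explicit request always wins
assert torch_auto_device("cpu") == torch.device("cpu")
```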
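And since `--device` now defaults to `None`, starting the server without flags (e.g. `python -m moshi.server`) auto-detects MPS on Apple Silicon, while `--device cuda`, `--device cpu`, or `--device mps` still pins the backend explicitly.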