mosaicml · Landanjs · Oct 10, 2023 · Oct 6, 2023 · Oct 6, 2023 · Oct 6, 2023
diff --git a/diffusion/datasets/synthetic_image_caption.py b/diffusion/datasets/synthetic_image_caption.py
@@ -16,28 +16,35 @@ class SyntheticImageCaptionDataset(Dataset):
     Args:
         image_size (int): Size of the synthetic images. Default: ``512``.
         caption_length (int): Length of the synthetic captions. Default: ``77``.
-        num_samples (int): Number of samples in the synthetic dataset. Default: ``100_000``.
+        sdxl (bool): If ``True``, generate SDXL-style samples: captions of shape ``(2, caption_length)`` plus the ``cond_*`` crop and size tensors. Default: ``False``.
     """
 
-    def __init__(self, image_size: int = 512, caption_length: int = 77, num_samples: int = 100_000):
+    def __init__(self, image_size: int = 512, caption_length: int = 77, sdxl: bool = False):
 
         super().__init__()
-        self.num_samples = num_samples
-        self.images = torch.randn(num_samples, 3, image_size, image_size)
-        self.captions = torch.randint(0, 128, (num_samples, caption_length), dtype=torch.long)
+        self.image_size = image_size
+        self.sdxl = sdxl
+        self.caption_shape = (2, caption_length) if self.sdxl else (caption_length,)
 
     def __len__(self):
-        return len(self.images)
+        return 100_000
 
     def __getitem__(self, idx):
-        return {'image': self.images[idx], 'captions': self.captions[idx]}
+        out = {}
+        if self.sdxl:
+            out['cond_crops_coords_top_left'] = torch.tensor([0, 0], dtype=torch.float)
+            out['cond_original_size'] = torch.tensor([self.image_size, self.image_size], dtype=torch.float)
+            out['cond_target_size'] = torch.tensor([self.image_size, self.image_size], dtype=torch.float)
+        out['image'] = torch.randn(3, self.image_size, self.image_size)
+        out['captions'] = torch.randint(0, 128, self.caption_shape, dtype=torch.long)
+        return out
 
 
 def build_synthetic_image_caption_dataloader(
     batch_size: int,
     image_size: int = 512,
     caption_length: int = 77,
-    num_samples: int = 100_000,
+    sdxl: bool = False,
     dataloader_kwargs: Optional[Dict] = None,
 ):
     """Builds a dataloader for the synthetic image-caption dataset.
@@ -46,7 +53,7 @@ def build_synthetic_image_caption_dataloader(
         batch_size (int): Batch size for the dataloader.
         image_size (int): Size of the synthetic images. Default: ``512``.
         caption_length (int): Length of the synthetic captions. Default: ``77``.
-        num_samples (int): Number of samples in the synthetic dataset. Default: ``100_000``.
+        sdxl (bool): Forwarded to ``SyntheticImageCaptionDataset``; if ``True``, the dataset yields SDXL-style samples. Default: ``False``.
         dataloader_kwargs (optional, dict): Additional arguments to pass to the dataloader. Default ``None``.
     """
     if dataloader_kwargs is None:
@@ -55,7 +62,7 @@ def build_synthetic_image_caption_dataloader(
     dataset = SyntheticImageCaptionDataset(
         image_size=image_size,
         caption_length=caption_length,
-        num_samples=num_samples,
+        sdxl=sdxl,
     )
 
     dataloader = DataLoader(