JeffreyXiang committed · Commit 388d03f · 1 Parent(s): 917a889
.gitignore CHANGED
@@ -19,7 +19,6 @@ lib64/
  parts/
  sdist/
  var/
- wheels/
  share/python-wheels/
  *.egg-info/
  .installed.cfg
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: TRELLIS.2
  emoji: 🏢
- colorFrom: indigo
- colorTo: blue
+ colorFrom: purple
+ colorTo: red
  sdk: gradio
  sdk_version: 6.1.0
  app_file: app.py
requirements.txt CHANGED
@@ -13,8 +13,13 @@ trimesh==4.10.1
  transformers==4.46.3
  git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/cumesh-0.0.1-cp310-cp310-linux_x86_64.whl?download=true
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/flex_gemm-0.0.1-cp310-cp310-linux_x86_64.whl?download=true
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/o_voxel-0.0.1-cp310-cp310-linux_x86_64.whl?download=true
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/nvdiffrast-0.3.5-cp310-cp310-linux_x86_64?download=true
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/nvdiffrec_render-0.0.0-cp310-cp310-linux_x86_64.whl?download=true
+ https://github.com/JeffreyXiang/Storages/releases/download/Space_Wheels_251210/cumesh-0.0.1-cp310-cp310-linux_x86_64.whl
+ https://github.com/JeffreyXiang/Storages/releases/download/Space_Wheels_251210/flex_gemm-0.0.1-cp310-cp310-linux_x86_64.whl
+ https://github.com/JeffreyXiang/Storages/releases/download/Space_Wheels_251210/o_voxel-0.0.1-cp310-cp310-linux_x86_64.whl
+ https://github.com/JeffreyXiang/Storages/releases/download/Space_Wheels_251210/nvdiffrast-0.3.5-cp310-cp310-linux_x86_64.whl
+ https://github.com/JeffreyXiang/Storages/releases/download/Space_Wheels_251210/nvdiffrec_render-0.0.0-cp310-cp310-linux_x86_64.whl
+ # https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/cumesh-0.0.1-cp310-cp310-linux_x86_64.whl?download=true
+ # https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/flex_gemm-0.0.1-cp310-cp310-linux_x86_64.whl?download=true
+ # https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/o_voxel-0.0.1-cp310-cp310-linux_x86_64.whl?download=true
+ # https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/nvdiffrast-0.3.5-cp310-cp310-linux_x86_64?download=true
+ # https://huggingface.co/spaces/JeffreyXiang/TRELLIS.2/resolve/main/wheels/nvdiffrec_render-0.0.0-cp310-cp310-linux_x86_64.whl?download=true
trellis2/modules/image_feature_extractor.py ADDED
@@ -0,0 +1,118 @@
+ from typing import *
+ import torch
+ import torch.nn.functional as F
+ from torchvision import transforms
+ from transformers import DINOv3ViTModel
+ import numpy as np
+ from PIL import Image
+
+
+ class DinoV2FeatureExtractor:
+     """
+     Feature extractor for DINOv2 models.
+     """
+     def __init__(self, model_name: str):
+         self.model_name = model_name
+         self.model = torch.hub.load('facebookresearch/dinov2', model_name, pretrained=True)
+         self.model.eval()
+         self.transform = transforms.Compose([
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ])
+
+     def to(self, device):
+         self.model.to(device)
+
+     def cuda(self):
+         self.model.cuda()
+
+     def cpu(self):
+         self.model.cpu()
+
+     @torch.no_grad()
+     def __call__(self, image: Union[torch.Tensor, List[Image.Image]]) -> torch.Tensor:
+         """
+         Extract features from the image.
+
+         Args:
+             image: A batch of images as a tensor of shape (B, C, H, W) or a list of PIL images.
+
+         Returns:
+             A tensor of shape (B, N, D) where N is the number of patches and D is the feature dimension.
+         """
+         if isinstance(image, torch.Tensor):
+             assert image.ndim == 4, "Image tensor should be batched (B, C, H, W)"
+         elif isinstance(image, list):
+             assert all(isinstance(i, Image.Image) for i in image), "Image list should be list of PIL images"
+             image = [i.resize((518, 518), Image.LANCZOS) for i in image]
+             image = [np.array(i.convert('RGB')).astype(np.float32) / 255 for i in image]
+             image = [torch.from_numpy(i).permute(2, 0, 1).float() for i in image]
+             image = torch.stack(image).cuda()
+         else:
+             raise ValueError(f"Unsupported type of image: {type(image)}")
+
+         image = self.transform(image).cuda()
+         features = self.model(image, is_training=True)['x_prenorm']
+         patchtokens = F.layer_norm(features, features.shape[-1:])
+         return patchtokens
+
+
+ class DinoV3FeatureExtractor:
+     """
+     Feature extractor for DINOv3 models.
+     """
+     def __init__(self, model_name: str, image_size=512):
+         self.model_name = model_name
+         self.model = DINOv3ViTModel.from_pretrained(model_name)
+         self.model.eval()
+         self.image_size = image_size
+         self.transform = transforms.Compose([
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ])
+
+     def to(self, device):
+         self.model.to(device)
+
+     def cuda(self):
+         self.model.cuda()
+
+     def cpu(self):
+         self.model.cpu()
+
+     def extract_features(self, image: torch.Tensor) -> torch.Tensor:
+         image = image.to(self.model.embeddings.patch_embeddings.weight.dtype)
+         hidden_states = self.model.embeddings(image, bool_masked_pos=None)
+         position_embeddings = self.model.rope_embeddings(image)
+
+         for i, layer_module in enumerate(self.model.layer):
+             hidden_states = layer_module(
+                 hidden_states,
+                 position_embeddings=position_embeddings,
+             )
+
+         return F.layer_norm(hidden_states, hidden_states.shape[-1:])
+
+     @torch.no_grad()
+     def __call__(self, image: Union[torch.Tensor, List[Image.Image]]) -> torch.Tensor:
+         """
+         Extract features from the image.
+
+         Args:
+             image: A batch of images as a tensor of shape (B, C, H, W) or a list of PIL images.
+
+         Returns:
+             A tensor of shape (B, N, D) where N is the number of patches and D is the feature dimension.
+         """
+         if isinstance(image, torch.Tensor):
+             assert image.ndim == 4, "Image tensor should be batched (B, C, H, W)"
+         elif isinstance(image, list):
+             assert all(isinstance(i, Image.Image) for i in image), "Image list should be list of PIL images"
+             image = [i.resize((self.image_size, self.image_size), Image.LANCZOS) for i in image]
+             image = [np.array(i.convert('RGB')).astype(np.float32) / 255 for i in image]
+             image = [torch.from_numpy(i).permute(2, 0, 1).float() for i in image]
+             image = torch.stack(image).cuda()
+         else:
+             raise ValueError(f"Unsupported type of image: {type(image)}")
+
+         image = self.transform(image).cuda()
+         features = self.extract_features(image)
+         return features
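For orientation, a minimal usage sketch of the new module follows. It is a sketch, not part of the commit: it assumes a CUDA device is available (both extractors move tensors to CUDA internally), and the DINOv3 checkpoint name is an assumed example; the actual model name comes from the pipeline's pretrained config.

    # Hypothetical usage sketch; the checkpoint name below is an assumed
    # example, and a CUDA device is required.
    from PIL import Image
    from trellis2.modules.image_feature_extractor import DinoV3FeatureExtractor

    extractor = DinoV3FeatureExtractor('facebook/dinov3-vitl16-pretrain-lvd1689m', image_size=512)
    extractor.cuda()                      # move the backbone to the GPU
    images = [Image.open('example.png')]  # PIL inputs are resized to image_size internally
    tokens = extractor(images)            # -> (B, N, D) patch-token features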
trellis2/pipelines/trellis2_image_to_3d.py CHANGED
@@ -5,8 +5,8 @@ import numpy as np
  from PIL import Image
  from .base import Pipeline
  from . import samplers, rembg
- from .. import trainers
- from ..modules import sparse as sp
+ from ..modules.sparse import SparseTensor
+ from ..modules import image_feature_extractor
  from ..representations import Mesh, MeshWithVoxel


@@ -24,7 +24,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
          tex_slat_sampler_params (dict): The parameters for the texture latent sampler.
          shape_slat_normalization (dict): The normalization parameters for the structured latent.
          tex_slat_normalization (dict): The normalization parameters for the texture latent.
-         image_cond_model (trainers.Trainer): The image conditioning model.
+         image_cond_model (Callable): The image conditioning model.
          rembg_model (Callable): The model for removing background.
          low_vram (bool): Whether to use low-VRAM mode.
      """
@@ -92,7 +92,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
          new_pipeline.shape_slat_normalization = args['shape_slat_normalization']
          new_pipeline.tex_slat_normalization = args['tex_slat_normalization']

-         new_pipeline.image_cond_model = getattr(trainers, args['image_cond_model']['name'])(**args['image_cond_model']['args'])
+         new_pipeline.image_cond_model = getattr(image_feature_extractor, args['image_cond_model']['name'])(**args['image_cond_model']['args'])
          new_pipeline.rembg_model = getattr(rembg, args['rembg_model']['name'])(**args['rembg_model']['args'])

          new_pipeline.low_vram = args.get('low_vram', True)
@@ -230,7 +230,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
          flow_model,
          coords: torch.Tensor,
          sampler_params: dict = {},
-     ) -> sp.SparseTensor:
+     ) -> SparseTensor:
          """
          Sample structured latent with the given conditioning.

@@ -240,7 +240,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
              sampler_params (dict): Additional parameters for the sampler.
          """
          # Sample structured latent
-         noise = sp.SparseTensor(
+         noise = SparseTensor(
              feats=torch.randn(coords.shape[0], flow_model.in_channels).to(self.device),
              coords=coords,
          )
@@ -275,7 +275,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
          coords: torch.Tensor,
          sampler_params: dict = {},
          max_num_tokens: int = 49152,
-     ) -> sp.SparseTensor:
+     ) -> SparseTensor:
          """
          Sample structured latent with the given conditioning.

@@ -285,7 +285,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
              sampler_params (dict): Additional parameters for the sampler.
          """
          # LR
-         noise = sp.SparseTensor(
+         noise = SparseTensor(
              feats=torch.randn(coords.shape[0], flow_model_lr.in_channels).to(self.device),
              coords=coords,
          )
@@ -329,7 +329,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
              hr_resolution -= 128

          # Sample structured latent
-         noise = sp.SparseTensor(
+         noise = SparseTensor(
              feats=torch.randn(coords.shape[0], flow_model.in_channels).to(self.device),
              coords=coords,
          )
@@ -355,19 +355,19 @@ class Trellis2ImageTo3DPipeline(Pipeline):

      def decode_shape_slat(
          self,
-         slat: sp.SparseTensor,
+         slat: SparseTensor,
          resolution: int,
-     ) -> Tuple[List[Mesh], List[sp.SparseTensor]]:
+     ) -> Tuple[List[Mesh], List[SparseTensor]]:
          """
          Decode the structured latent.

          Args:
-             slat (sp.SparseTensor): The structured latent.
+             slat (SparseTensor): The structured latent.
              formats (List[str]): The formats to decode the structured latent to.

          Returns:
              List[Mesh]: The decoded meshes.
-             List[sp.SparseTensor]: The decoded substructures.
+             List[SparseTensor]: The decoded substructures.
          """
          self.models['shape_slat_decoder'].set_resolution(resolution)
          if self.low_vram:
@@ -383,15 +383,15 @@ class Trellis2ImageTo3DPipeline(Pipeline):
          self,
          cond: dict,
          flow_model,
-         shape_slat: sp.SparseTensor,
+         shape_slat: SparseTensor,
          sampler_params: dict = {},
-     ) -> sp.SparseTensor:
+     ) -> SparseTensor:
          """
          Sample structured latent with the given conditioning.

          Args:
              cond (dict): The conditioning information.
-             shape_slat (sp.SparseTensor): The structured latent for shape
+             shape_slat (SparseTensor): The structured latent for shape
              sampler_params (dict): Additional parameters for the sampler.
          """
          # Sample structured latent
@@ -424,18 +424,18 @@ class Trellis2ImageTo3DPipeline(Pipeline):

      def decode_tex_slat(
          self,
-         slat: sp.SparseTensor,
-         subs: List[sp.SparseTensor],
-     ) -> sp.SparseTensor:
+         slat: SparseTensor,
+         subs: List[SparseTensor],
+     ) -> SparseTensor:
          """
          Decode the structured latent.

          Args:
-             slat (sp.SparseTensor): The structured latent.
+             slat (SparseTensor): The structured latent.
              formats (List[str]): The formats to decode the structured latent to.

          Returns:
-             List[sp.SparseTensor]: The decoded texture voxels
+             List[SparseTensor]: The decoded texture voxels
          """
          if self.low_vram:
              self.models['tex_slat_decoder'].to(self.device)
@@ -447,16 +447,16 @@ class Trellis2ImageTo3DPipeline(Pipeline):
      @torch.no_grad()
      def decode_latent(
          self,
-         shape_slat: sp.SparseTensor,
-         tex_slat: sp.SparseTensor,
+         shape_slat: SparseTensor,
+         tex_slat: SparseTensor,
          resolution: int,
      ) -> List[MeshWithVoxel]:
          """
          Decode the latent codes.

          Args:
-             shape_slat (sp.SparseTensor): The structured latent for shape.
-             tex_slat (sp.SparseTensor): The structured latent for texture.
+             shape_slat (SparseTensor): The structured latent for shape.
+             tex_slat (SparseTensor): The structured latent for texture.
              resolution (int): The resolution of the output.
          """
          meshes, subs = self.decode_shape_slat(shape_slat, resolution)
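To make the from_pretrained change above concrete, here is a sketch of the config-driven lookup it switches to: the conditioning model class is now resolved by name from the new image_feature_extractor module instead of from trainers. The config fragment is hypothetical; the real values live in the pipeline's pretrained config.

    # Hypothetical config fragment illustrating the getattr-based resolution.
    from trellis2.modules import image_feature_extractor

    args = {'image_cond_model': {
        'name': 'DinoV3FeatureExtractor',   # class defined in the new module
        'args': {'model_name': 'facebook/dinov3-vitl16-pretrain-lvd1689m'},  # assumed example
    }}
    cls = getattr(image_feature_extractor, args['image_cond_model']['name'])
    image_cond_model = cls(**args['image_cond_model']['args'])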