diff --git a/gradio_app.py b/gradio_app.py
index 5c0c63c5..dc7e42d9 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -37,7 +37,7 @@
 parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")
 # network backbone
 parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
-parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, tcnn, vanilla]")
+parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, vanilla]")
 # rendering resolution in training, decrease this if CUDA OOM.
 parser.add_argument('--w', type=int, default=64, help="render width for NeRF in training")
 parser.add_argument('--h', type=int, default=64, help="render height for NeRF in training")
@@ -78,8 +78,6 @@
 
 if opt.backbone == 'vanilla':
     from nerf.network import NeRFNetwork
-elif opt.backbone == 'tcnn':
-    from nerf.network_tcnn import NeRFNetwork
 elif opt.backbone == 'grid':
     from nerf.network_grid import NeRFNetwork
 else:
diff --git a/main.py b/main.py
index ae415ffd..06cfb343 100644
--- a/main.py
+++ b/main.py
@@ -13,6 +13,7 @@
     parser = argparse.ArgumentParser()
     parser.add_argument('--text', default=None, help="text prompt")
+    parser.add_argument('--negative', default='', type=str, help="negative text prompt")
     parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --dir_text")
     parser.add_argument('-O2', action='store_true', help="equals --fp16 --dir_text")
     parser.add_argument('--test', action='store_true', help="test mode")
@@ -38,7 +39,7 @@
     parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")
     # network backbone
     parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
-    parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, tcnn, vanilla]")
+    parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, vanilla]")
     # rendering resolution in training, decrease this if CUDA OOM.
     parser.add_argument('--w', type=int, default=64, help="render width for NeRF in training")
     parser.add_argument('--h', type=int, default=64, help="render height for NeRF in training")
@@ -51,12 +52,14 @@
     parser.add_argument('--radius_range', type=float, nargs='*', default=[1.0, 1.5], help="training camera radius range")
     parser.add_argument('--fovy_range', type=float, nargs='*', default=[40, 70], help="training camera fovy range")
     parser.add_argument('--dir_text', action='store_true', help="direction-encode the text prompt, by appending front/side/back/overhead view")
+    parser.add_argument('--negative_dir_text', action='store_true', help="also use negative dir text prompt.")
     parser.add_argument('--angle_overhead', type=float, default=30, help="[0, angle_overhead] is the overhead region")
     parser.add_argument('--angle_front', type=float, default=60, help="[0, angle_front] is the front region, [180, 180+angle_front] the back region, otherwise the side region.")
     parser.add_argument('--lambda_entropy', type=float, default=1e-4, help="loss scale for alpha entropy")
     parser.add_argument('--lambda_opacity', type=float, default=0, help="loss scale for alpha value")
     parser.add_argument('--lambda_orient', type=float, default=1e-2, help="loss scale for orientation")
+    parser.add_argument('--lambda_smooth', type=float, default=0, help="loss scale for surface normal smoothness")
 
     ### GUI options
     parser.add_argument('--gui', action='store_true', help="start a GUI")
@@ -73,21 +76,22 @@
     if opt.O:
         opt.fp16 = True
         opt.dir_text = True
-        # use occupancy grid to prune ray sampling, faster rendering.
+        opt.negative_dir_text = True
         opt.cuda_ray = True
+        # opt.lambda_entropy = 1e-4
         # opt.lambda_opacity = 0
 
     elif opt.O2:
         opt.fp16 = True
         opt.dir_text = True
+        opt.negative_dir_text = True
+        opt.lambda_entropy = 1e-4 # necessary to keep non-empty
         opt.lambda_opacity = 3e-3 # no occupancy grid, so use a stronger opacity loss.
 
     if opt.backbone == 'vanilla':
         from nerf.network import NeRFNetwork
-    elif opt.backbone == 'tcnn':
-        from nerf.network_tcnn import NeRFNetwork
     elif opt.backbone == 'grid':
         from nerf.network_grid import NeRFNetwork
     else:
diff --git a/nerf/clip.py b/nerf/clip.py
index 23a03240..f33258aa 100644
--- a/nerf/clip.py
+++ b/nerf/clip.py
@@ -23,7 +23,9 @@ def __init__(self, device):
         # self.gaussian_blur = T.GaussianBlur(15, sigma=(0.1, 10))
 
-    def get_text_embeds(self, prompt):
+    def get_text_embeds(self, prompt, negative_prompt):
+
+        # NOTE: negative_prompt is ignored for CLIP.
 
         text = clip.tokenize(prompt).to(self.device)
         text_z = self.clip_model.encode_text(text)
diff --git a/nerf/gui.py b/nerf/gui.py
index fe340835..335697d7 100644
--- a/nerf/gui.py
+++ b/nerf/gui.py
@@ -175,6 +175,9 @@ def register_dpg(self):
             # text prompt
             if self.opt.text is not None:
                 dpg.add_text("text: " + self.opt.text, tag="_log_prompt_text")
+
+            if self.opt.negative != '':
+                dpg.add_text("negative text: " + self.opt.negative, tag="_log_prompt_negative_text")
 
             # button theme
             with dpg.theme() as theme_button:
diff --git a/nerf/network.py b/nerf/network.py
index 8a7c8e6e..eb2c5217 100644
--- a/nerf/network.py
+++ b/nerf/network.py
@@ -97,6 +97,19 @@ def finite_difference_normal(self, x, epsilon=1e-2):
         return normal
 
+    def normal(self, x):
+
+        with torch.enable_grad():
+            x.requires_grad_(True)
+            sigma, albedo = self.common_forward(x)
+            # query gradient
+            normal = - torch.autograd.grad(torch.sum(sigma), x, create_graph=True)[0] # [N, 3]
+
+        # normalize...
+        normal = safe_normalize(normal)
+        normal[torch.isnan(normal)] = 0
+        return normal
+
     def forward(self, x, d, l=None, ratio=1, shading='albedo'):
         # x: [N, 3], in [-bound, bound]
         # d: [N, 3], view direction, nomalized in [-1, 1]
diff --git a/nerf/network_grid.py b/nerf/network_grid.py
index c6cf436a..c10416fe 100644
--- a/nerf/network_grid.py
+++ b/nerf/network_grid.py
@@ -46,7 +46,7 @@ def __init__(self,
         self.num_layers = num_layers
         self.hidden_dim = hidden_dim
 
-        self.encoder, self.in_dim = get_encoder('tiledgrid', input_dim=3, desired_resolution=2048 * self.bound)
+        self.encoder, self.in_dim = get_encoder('tiledgrid', input_dim=3, log2_hashmap_size=16, desired_resolution=2048 * self.bound)
 
         self.sigma_net = MLP(self.in_dim, 4, hidden_dim, num_layers, bias=True)
@@ -103,6 +103,16 @@ def finite_difference_normal(self, x, epsilon=1e-2):
         ], dim=-1)
 
         return normal
+
+
+    def normal(self, x):
+
+        normal = self.finite_difference_normal(x)
+        normal = safe_normalize(normal)
+        normal[torch.isnan(normal)] = 0
+
+        return normal
+
 
     def forward(self, x, d, l=None, ratio=1, shading='albedo'):
         # x: [N, 3], in [-bound, bound]
@@ -119,17 +129,7 @@ def forward(self, x, d, l=None, ratio=1, shading='albedo'):
 
             # query normal
             sigma, albedo = self.common_forward(x)
-            normal = self.finite_difference_normal(x)
-
-            # with torch.enable_grad():
-            #     x.requires_grad_(True)
-            #     sigma, albedo = self.common_forward(x)
-            #     # query gradient
-            #     normal = - torch.autograd.grad(torch.sum(sigma), x, create_graph=True)[0] # [N, 3]
-
-            # normalize...
-            normal = safe_normalize(normal)
-            normal[torch.isnan(normal)] = 0
+            normal = self.normal(x)
 
             # lambertian shading
             lambertian = ratio + (1 - ratio) * (normal @ -l).clamp(min=0) # [N,]
diff --git a/nerf/network_tcnn.py b/nerf/network_tcnn.py
deleted file mode 100644
index 056ec72b..00000000
--- a/nerf/network_tcnn.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from activation import trunc_exp
-from .renderer import NeRFRenderer
-from encoding import get_encoder
-
-import numpy as np
-import tinycudann as tcnn
-
-class MLP(nn.Module):
-    def __init__(self, dim_in, dim_out, dim_hidden, num_layers, bias=True):
-        super().__init__()
-        self.dim_in = dim_in
-        self.dim_out = dim_out
-        self.dim_hidden = dim_hidden
-        self.num_layers = num_layers
-
-        net = []
-        for l in range(num_layers):
-            net.append(nn.Linear(self.dim_in if l == 0 else self.dim_hidden, self.dim_out if l == num_layers - 1 else self.dim_hidden, bias=bias))
-
-        self.net = nn.ModuleList(net)
-
-    def forward(self, x):
-        for l in range(self.num_layers):
-            x = self.net[l](x)
-            if l != self.num_layers - 1:
-                x = F.relu(x, inplace=True)
-        return x
-
-
-class NeRFNetwork(NeRFRenderer):
-    def __init__(self,
-                 opt,
-                 num_layers=3,
-                 hidden_dim=64,
-                 num_layers_bg=2,
-                 hidden_dim_bg=64,
-                 ):
-
-        super().__init__(opt)
-
-        self.num_layers = num_layers
-        self.hidden_dim = hidden_dim
-
-        per_level_scale = np.exp2(np.log2(2048 * self.bound / 16) / (16 - 1))
-
-        self.encoder = tcnn.Encoding(
-            n_input_dims=3,
-            encoding_config={
-                "otype": "HashGrid",
-                "n_levels": 16,
-                "n_features_per_level": 2,
-                "log2_hashmap_size": 19,
-                "base_resolution": 16,
-                "per_level_scale": per_level_scale,
-            },
-        )
-
-        self.sigma_net = MLP(32, 4, hidden_dim, num_layers, bias=True)
-
-        # background network
-        if self.bg_radius > 0:
-            self.num_layers_bg = num_layers_bg
-            self.hidden_dim_bg = hidden_dim_bg
-
-            self.encoder_bg, self.in_dim_bg = get_encoder('frequency', input_dim=3)
-
-            self.bg_net = MLP(self.in_dim_bg, 3, hidden_dim_bg, num_layers_bg, bias=True)
-
-        else:
-            self.bg_net = None
-
-    def gaussian(self, x):
-        # x: [B, N, 3]
-
-        d = (x ** 2).sum(-1)
-        g = 5 * torch.exp(-d / (2 * 0.2 ** 2))
-
-        return g
-
-    def common_forward(self, x):
-        # x: [N, 3], in [-bound, bound]
-
-        # sigma
-        h = (x + self.bound) / (2 * self.bound) # to [0, 1]
-        h = self.encoder(h)
-
-        h = self.sigma_net(h)
-
-        sigma = trunc_exp(h[..., 0] + self.gaussian(x))
-        albedo = torch.sigmoid(h[..., 1:])
-
-        return sigma, albedo
-
-
-    def forward(self, x, d, l=None, ratio=1, shading='albedo'):
-        # x: [N, 3], in [-bound, bound]
-        # d: [N, 3], view direction, nomalized in [-1, 1]
-        # l: [3], plane light direction, nomalized in [-1, 1]
-        # ratio: scalar, ambient ratio, 1 == no shading (albedo only)
-
-        if shading == 'albedo':
-            # no need to query normal
-            sigma, color = self.common_forward(x)
-            normal = None
-
-        else:
-            # query normal
-            has_grad = torch.is_grad_enabled()
-
-            with torch.enable_grad():
-                x.requires_grad_(True)
-                sigma, albedo = self.common_forward(x)
-                # query gradient
-                normal = torch.autograd.grad(torch.sum(sigma), x, create_graph=True)[0] # [N, 3]
-
-            # normalize...
-            normal = normal / (torch.norm(normal, dim=-1, keepdim=True) + 1e-9)
-            normal[torch.isnan(normal)] = 0
-
-            if not has_grad:
-                normal = normal.detach()
-
-            # lambertian shading
-            lambertian = ratio + (1 - ratio) * (normal @ l).clamp(min=0) # [N,]
-
-            if shading == 'textureless':
-                color = lambertian.unsqueeze(-1).repeat(1, 3)
-            elif shading == 'normal':
-                color = (normal + 1) / 2
-            else: # 'lambertian'
-                color = albedo * lambertian.unsqueeze(-1)
-
-        return sigma, color, normal
-
-
-    def density(self, x):
-        # x: [N, 3], in [-bound, bound]
-
-        sigma, _ = self.common_forward(x)
-
-        return {
-            'sigma': sigma
-        }
-
-
-    def background(self, d):
-        # x: [N, 2], in [-1, 1]
-
-        h = self.encoder_bg(d) # [N, C]
-
-        h = self.bg_net(h)
-
-        # sigmoid activation for rgb
-        rgbs = torch.sigmoid(h)
-
-        return rgbs
-
-    # optimizer utils
-    def get_params(self, lr):
-
-        params = [
-            {'params': self.encoder.parameters(), 'lr': lr * 10},
-            {'params': self.sigma_net.parameters(), 'lr': lr},
-        ]
-
-        if self.bg_radius > 0:
-            params.append({'params': self.encoder_bg.parameters(), 'lr': lr * 10})
-            params.append({'params': self.bg_net.parameters(), 'lr': lr})
-
-        return params
\ No newline at end of file
diff --git a/nerf/renderer.py b/nerf/renderer.py
index 701a972c..ee50d3e6 100644
--- a/nerf/renderer.py
+++ b/nerf/renderer.py
@@ -399,14 +399,17 @@ def run(self, rays_o, rays_d, num_steps=128, upsample_steps=128, light_d=None, a
         sigmas, rgbs, normals = self(xyzs.reshape(-1, 3), dirs.reshape(-1, 3), light_d, ratio=ambient_ratio, shading=shading)
         rgbs = rgbs.view(N, -1, 3) # [N, T+t, 3]
 
-        #print(xyzs.shape, 'valid_rgb:', mask.sum().item())
-
-        # orientation loss
         if normals is not None:
+            # orientation loss
             normals = normals.view(N, -1, 3)
-            # print(weights.shape, normals.shape, dirs.shape)
             loss_orient = weights.detach() * (normals * dirs).sum(-1).clamp(min=0) ** 2
             results['loss_orient'] = loss_orient.mean()
+
+            # surface normal smoothness
+            normals_perturb = self.normal(xyzs + torch.randn_like(xyzs) * 1e-2).view(N, -1, 3)
+            loss_smooth = (normals - normals_perturb).abs()
+            results['loss_smooth'] = loss_smooth.mean()
+
         # calculate weight_sum (mask)
         weights_sum = weights.sum(dim=-1) # [N]
@@ -478,12 +481,18 @@ def run_cuda(self, rays_o, rays_d, dt_gamma=0, light_d=None, ambient_ratio=1.0,
             weights_sum, depth, image = raymarching.composite_rays_train(sigmas, rgbs, deltas, rays, T_thresh)
 
-            # orientation loss
+            # normals related regularizations
             if normals is not None:
+                # orientation loss
                 weights = 1 - torch.exp(-sigmas)
                 loss_orient = weights.detach() * (normals * dirs).sum(-1).clamp(min=0) ** 2
                 results['loss_orient'] = loss_orient.mean()
+
+                # surface normal smoothness
+                normals_perturb = self.normal(xyzs + torch.randn_like(xyzs) * 1e-2)
+                loss_smooth = (normals - normals_perturb).abs()
+                results['loss_smooth'] = loss_smooth.mean()
+
         else:
             # allocate outputs
diff --git a/nerf/sd.py b/nerf/sd.py
index 1d922350..31497e56 100644
--- a/nerf/sd.py
+++ b/nerf/sd.py
@@ -10,6 +10,12 @@
 import time
 
+def seed_everything(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    #torch.backends.cudnn.deterministic = True
+    #torch.backends.cudnn.benchmark = True
+
 class StableDiffusion(nn.Module):
     def __init__(self, device):
         super().__init__()
@@ -30,14 +36,14 @@ def __init__(self, device):
         print(f'[INFO] loading stable diffusion...')
 
         # 1. Load the autoencoder model which will be used to decode the latents into image space.
-        self.vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_auth_token=self.token).to(self.device)
+        self.vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae", use_auth_token=self.token).to(self.device)
 
         # 2. Load the tokenizer and text encoder to tokenize and encode the text.
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
         self.text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
 
         # 3. The UNet model for generating the latents.
-        self.unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", use_auth_token=self.token).to(self.device)
+        self.unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", use_auth_token=self.token).to(self.device)
 
         # 4. Create a scheduler for inference
         self.scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=self.num_train_timesteps)
@@ -45,7 +51,9 @@ def __init__(self, device):
         print(f'[INFO] loaded stable diffusion!')
 
-    def get_text_embeds(self, prompt):
+    def get_text_embeds(self, prompt, negative_prompt):
+        # prompt, negative_prompt: [str]
+
         # Tokenize text and get embeddings
         text_input = self.tokenizer(prompt, padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')
@@ -53,7 +61,7 @@ def get_text_embeds(self, prompt):
             text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
 
         # Do the same for unconditional embeddings
-        uncond_input = self.tokenizer([''] * len(prompt), padding='max_length', max_length=self.tokenizer.model_max_length, return_tensors='pt')
+        uncond_input = self.tokenizer(negative_prompt, padding='max_length', max_length=self.tokenizer.model_max_length, return_tensors='pt')
 
         with torch.no_grad():
             uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
@@ -155,13 +163,16 @@ def encode_imgs(self, imgs):
         return latents
 
-    def prompt_to_img(self, prompts, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
+    def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
 
         if isinstance(prompts, str):
             prompts = [prompts]
+
+        if isinstance(negative_prompts, str):
+            negative_prompts = [negative_prompts]
 
         # Prompts -> text embeds
-        text_embeds = self.get_text_embeds(prompts) # [2, 77, 768]
+        text_embeds = self.get_text_embeds(prompts, negative_prompts) # [2, 77, 768]
 
         # Text embeds -> img latents
         latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale) # [1, 4, 64, 64]
@@ -183,16 +194,20 @@ def prompt_to_img(self, prompts, height=512, width=512, num_inference_steps=50,
     parser = argparse.ArgumentParser()
     parser.add_argument('prompt', type=str)
+    parser.add_argument('--negative', default='', type=str)
     parser.add_argument('-H', type=int, default=512)
     parser.add_argument('-W', type=int, default=512)
+    parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--steps', type=int, default=50)
 
     opt = parser.parse_args()
 
+    seed_everything(opt.seed)
+
     device = torch.device('cuda')
 
     sd = StableDiffusion(device)
 
-    imgs = sd.prompt_to_img(opt.prompt, opt.H, opt.W, opt.steps)
+    imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps)
 
     # visualize image
     plt.imshow(imgs[0])
diff --git a/nerf/utils.py b/nerf/utils.py
index 7af9228c..6d32a962 100644
--- a/nerf/utils.py
+++ b/nerf/utils.py
@@ -296,12 +296,26 @@ def prepare_text_embeddings(self):
             return
 
         if not self.opt.dir_text:
-            self.text_z = self.guidance.get_text_embeds([self.opt.text])
+            self.text_z = self.guidance.get_text_embeds([self.opt.text], [self.opt.negative])
         else:
             self.text_z = []
             for d in ['front', 'side', 'back', 'side', 'overhead', 'bottom']:
+                # construct dir-encoded text
                 text = f"{self.opt.text}, {d} view"
-                text_z = self.guidance.get_text_embeds([text])
+
+                negative_text = f"{self.opt.negative}"
+
+                # explicit negative dir-encoded text
+                if self.opt.negative_dir_text:
+                    if negative_text != '': negative_text += ', '
+
+                    if d == 'back': negative_text += "front view"
+                    elif d == 'front': negative_text += "back view"
+                    elif d == 'side': negative_text += "front view, back view"
+                    elif d == 'overhead': negative_text += "bottom view"
+                    elif d == 'bottom': negative_text += "overhead view"
+
+                text_z = self.guidance.get_text_embeds([text], [negative_text])
                 self.text_z.append(text_z)
 
     def __del__(self):
@@ -382,6 +396,10 @@ def train_step(self, data):
         if self.opt.lambda_orient > 0 and 'loss_orient' in outputs:
             loss_orient = outputs['loss_orient']
             loss = loss + self.opt.lambda_orient * loss_orient
+
+        if self.opt.lambda_smooth > 0 and 'loss_smooth' in outputs:
+            loss_smooth = outputs['loss_smooth']
+            loss = loss + self.opt.lambda_smooth * loss_smooth
 
         return pred_rgb, pred_ws, loss
diff --git a/readme.md b/readme.md
index 52bff045..e49e7688 100644
--- a/readme.md
+++ b/readme.md
@@ -44,9 +44,6 @@
 pip install -r requirements.txt
 
 # (optional) install nvdiffrast for exporting textured mesh (--save_mesh)
 pip install git+https://github.com/NVlabs/nvdiffrast/
 
-# (optional) install the tcnn backbone if using --tcnn
-pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
-
 # (optional) install CLIP guidance for the dreamfield setting
 pip install git+https://github.com/openai/CLIP.git
@@ -80,6 +77,9 @@
 First time running will take some time to compile the CUDA extensions.
 
 # `--dir_text` enables view-dependent prompting.
 python main.py --text "a hamburger" --workspace trial -O
 
+# we also support negative text prompt now:
+python main.py --text "a rose" --negative "red" --workspace trial -O
+
 # if the above command fails to generate things (learns an empty scene), maybe try:
 # 1. disable random lambertian shading, simply use albedo as color:
 python main.py --text "a hamburger" --workspace trial -O --albedo_iters 10000 # i.e., set --albedo_iters >= --iters, which is default to 10000
@@ -138,7 +138,7 @@
 latents.backward(gradient=grad, retain_graph=True)
 * light direction: current implementation use a plane light source, instead of a point light source...
 * View-dependent prompting: `./nerf/provider.py > get_view_direction`.
 * ues `--angle_overhead, --angle_front` to set the border. How to better divide front/back/side regions?
-* Network backbone (`./nerf/network*.py`) can be chosen by the `--backbone` option, but `tcnn` and `vanilla` are not well tested.
+* Network backbone (`./nerf/network*.py`) can be chosen by the `--backbone` option, but `vanilla` is not well tested.
 * Spatial density bias (gaussian density blob): `./nerf/network*.py > NeRFNetwork > gaussian`.
 # Acknowledgement