Spaces:
Sleeping
Sleeping
| # Copyright (C) 2025 NVIDIA Corporation. All rights reserved. | |
| # | |
| # This work is licensed under the LICENSE file | |
| # located at the root directory. | |
| import gc | |
| import torch | |
| from visualization_utils import show_images | |
def _add_object(
    pipe,
    prompts,
    seed_src,
    seed_obj,
    extended_scale,
    source_latents,
    structure_transfer_step,
    subject_token,
    blend_steps,
    show_attention=False,
    localization_model="attention_points_sam",
    is_img_src=False,
    img_src_latents=None,
    use_offset=False,
    display_output=False,
):
    """Run the pipeline call shared by the public ``add_object_*`` helpers.

    Clears garbage and the CUDA cache, calls ``pipe`` once with ``prompts``
    and the seed pair ``[seed_src, seed_obj]``, optionally shows the result
    via ``show_images``, and returns the ``images`` attribute of the
    pipeline output.

    Fixed settings applied here: 1024x1024 output, 30 inference steps,
    ``max_sequence_length=512``, ``extended_steps_multi=10`` and
    ``extended_steps_single=20``. The guidance scale is the scalar ``3.5``
    unless ``is_img_src`` is truthy, in which case the list ``[1, 3.5]`` is
    passed. Every other argument is forwarded to ``pipe`` unchanged under
    the keyword of the same name.

    Args:
        pipe: Callable pipeline; must accept the keywords built below and
            return an object exposing ``.images``.
        prompts: Passed to ``pipe`` as ``prompt``.
        seed_src: First entry of the ``seed`` list.
        seed_obj: Second entry of the ``seed`` list.
        display_output: When truthy, ``show_images`` is called on the
            resulting images before returning.

    Returns:
        ``pipe(...).images``.
    """
    # Free memory before the heavy pipeline call.
    gc.collect()
    torch.cuda.empty_cache()

    # A real-image source uses a pair of guidance values instead of a scalar.
    if is_img_src:
        guidance = [1, 3.5]
    else:
        guidance = 3.5

    generation = {
        "prompt": prompts,
        "guidance_scale": guidance,
        "height": 1024,
        "width": 1024,
        "max_sequence_length": 512,
        "num_inference_steps": 30,
        "seed": [seed_src, seed_obj],
    }
    extended_attention = {
        "extended_scale": extended_scale,
        "extended_steps_multi": 10,
        "extended_steps_single": 20,
    }
    structure_transfer = {
        "source_latents": source_latents,
        "structure_transfer_step": structure_transfer_step,
    }
    latent_blending = {
        "subject_token": subject_token,
        "localization_model": localization_model,
        "blend_steps": blend_steps,
        "show_attention": show_attention,
    }
    real_image_source = {
        "is_img_src": is_img_src,
        "img_src_latents": img_src_latents,
        "use_offset": use_offset,
    }

    result = pipe(
        **generation,
        **extended_attention,
        **structure_transfer,
        **latent_blending,
        **real_image_source,
        tqdm_desc="Running Addit: Generating Edited Image",
    )

    images = result.images
    if display_output:
        show_images(images)
    return images
def add_object_generated(
    pipe,
    prompt_source,
    prompt_object,
    subject_token,
    seed_src,
    seed_obj,
    show_attention=False,
    extended_scale=1.05,
    structure_transfer_step=2,
    blend_steps=None,
    localization_model="attention_points_sam",
    display_output=False
):
    """Generate a source image from a prompt, then add an object to it.

    Works in two stages:

    1. ``pipe`` is called with ``prompt_source`` and ``seed_src`` using
       ``output_type="both"``; the call is unpacked into two values and only
       the second (the source latents) is kept.
    2. ``_add_object`` is called with the prompt pair
       ``[prompt_source, prompt_object]``, both seeds and those latents.

    Args:
        pipe: Pipeline object; called directly here and passed on to
            ``_add_object``.
        prompt_source: Prompt for the source image.
        prompt_object: Prompt for the edited image.
        subject_token: Forwarded to ``_add_object``.
        seed_src: Seed for the source generation; also forwarded.
        seed_obj: Seed forwarded to ``_add_object``.
        show_attention: Forwarded to ``_add_object``.
        extended_scale: Forwarded to ``_add_object``.
        structure_transfer_step: Forwarded to ``_add_object``.
        blend_steps: Forwarded to ``_add_object``. ``None`` (the default)
            means ``[15]``; a fresh list is created on every call so the
            default can never be shared or mutated between calls.
        localization_model: Forwarded to ``_add_object``.
        display_output: Forwarded to ``_add_object``.

    Returns:
        A ``(src_image, edited_image)`` tuple, obtained by unpacking the two
        values returned by ``_add_object``.
    """
    if blend_steps is None:
        blend_steps = [15]

    gc.collect()
    torch.cuda.empty_cache()

    # Generate the source image and keep its latents for structure transfer.
    print('Generating source image...')
    # The first returned value is not needed here: _add_object produces the
    # source image again alongside the edited one.
    _, source_latents = pipe(
        prompt=[prompt_source],
        guidance_scale=3.5,
        height=1024,
        width=1024,
        max_sequence_length=512,
        num_inference_steps=30,
        seed=[seed_src],
        output_type="both",
        tqdm_desc="Generating Source Image",
    )

    # Run the core combination logic.
    print('Running Addit...')
    src_image, edited_image = _add_object(
        pipe=pipe,
        prompts=[prompt_source, prompt_object],
        subject_token=subject_token,
        seed_src=seed_src,
        seed_obj=seed_obj,
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        extended_scale=extended_scale,
        blend_steps=blend_steps,
        show_attention=show_attention,
        localization_model=localization_model,
        display_output=display_output
    )
    return src_image, edited_image
def add_object_real(
    pipe,
    source_image,
    prompt_source,
    prompt_object,
    subject_token,
    seed_src,
    seed_obj,
    localization_model="attention_points_sam",
    extended_scale=1.05,
    structure_transfer_step=4,
    blend_steps=None,
    use_offset=False,
    show_attention=False,
    use_inversion=False,
    display_output=False
):
    """Add an object to an existing (real) image.

    Works in up to three stages:

    1. ``pipe.call_img2img`` is run on ``source_image`` with
       ``strength=0.1`` and ``output_type="latent"``; its ``.images`` is
       used as the source latents.
    2. If ``use_inversion`` is truthy, ``pipe.call_invert`` is run on those
       latents. The first element of each returned entry is collected and
       the list is reversed to form ``img_src_latents``; otherwise
       ``img_src_latents`` stays ``None``.
    3. ``_add_object`` is called with ``is_img_src=True`` and the values
       above.

    Both pipeline helper calls use a generator seeded with the fixed value
    ``0`` on ``pipe.device``; ``seed_src`` and ``seed_obj`` are only
    forwarded to ``_add_object``.

    Args:
        pipe: Pipeline object exposing ``call_img2img``, ``call_invert`` and
            ``device``; also passed on to ``_add_object``.
        source_image: Image passed to ``pipe.call_img2img``.
        prompt_source: Prompt describing the source image.
        prompt_object: Prompt for the edited image.
        subject_token: Forwarded to ``_add_object``.
        seed_src: Forwarded to ``_add_object``.
        seed_obj: Forwarded to ``_add_object``.
        localization_model: Forwarded to ``_add_object``.
        extended_scale: Forwarded to ``_add_object``.
        structure_transfer_step: Forwarded to ``_add_object``.
        blend_steps: Forwarded to ``_add_object``. ``None`` (the default)
            means ``[20]``; a fresh list is created on every call so the
            default can never be shared or mutated between calls.
        use_offset: Forwarded to ``_add_object``.
        show_attention: Forwarded to ``_add_object``.
        use_inversion: Enables the optional inversion stage.
        display_output: Forwarded to ``_add_object``.

    Returns:
        A ``(src_image, edited_image)`` tuple, obtained by unpacking the two
        values returned by ``_add_object``.
    """
    if blend_steps is None:
        blend_steps = [20]

    print('Noising-Denoising Original Image')
    gc.collect()
    torch.cuda.empty_cache()

    # Get initial latents by lightly noising and denoising the input image.
    source_latents = pipe.call_img2img(
        prompt=prompt_source,
        image=source_image,
        num_inference_steps=30,
        strength=0.1,
        guidance_scale=3.5,
        output_type="latent",
        generator=torch.Generator(device=pipe.device).manual_seed(0),
        tqdm_desc="Encoding Source Image",
    ).images

    # Optional inversion step.
    img_src_latents = None
    if use_inversion:
        print('Inverting Image')
        gc.collect()
        torch.cuda.empty_cache()
        latents_list = pipe.call_invert(
            prompt=prompt_source,
            image=source_latents,
            num_inference_steps=30,
            guidance_scale=1,
            fixed_point_iterations=2,
            generator=torch.Generator(device=pipe.device).manual_seed(0),
            tqdm_desc="Inverting Source Image",
        )
        # Keep the first element of every entry, in reverse order.
        img_src_latents = [x[0] for x in latents_list][::-1]

    print('Running Addit')
    gc.collect()
    torch.cuda.empty_cache()
    src_image, edited_image = _add_object(
        pipe,
        prompts=[prompt_source, prompt_object],
        seed_src=seed_src,
        seed_obj=seed_obj,
        extended_scale=extended_scale,
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        subject_token=subject_token,
        blend_steps=blend_steps,
        show_attention=show_attention,
        localization_model=localization_model,
        is_img_src=True,
        img_src_latents=img_src_latents,
        use_offset=use_offset,
        display_output=display_output,
    )
    return src_image, edited_image