按照源码进行部署,方便接口调用。
环境:python37+cuda10.1+nvidia driver 418.87
1.安装
torch==1.8.1+cu101
torchvision==0.9.1+cu101
tokenizers==0.11.6
transformers==4.20.0
huggingface-hub==0.2.0
timm
ftfy
sentencepiece
open_clip==2.16.0 自行编译一下,在git上下载源码,用python setup.py install --user安装,在requirement中降低pytorch的版本,改成>1.8
opencv-python
pytorch_lightning
更新libstdc++.so.6 1.3.9
更新glibc 2.18
addict
yapf
prettytable
omegaconf==2.1.1
xformers==0.0.2
更改ldm/modules/encoders/modules.py中88行的openai的clip的地址
有几处autocast和pytorch1.8相冲突,全部删掉autocast的推理即可。
ldm/modules/diffusionmodules/util.py", line 126 注掉autocast,
ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
# "dtype": torch.get_autocast_gpu_dtype(),
# 'dtype':torch.cuda.amp(),
# "cache_enabled": torch.is_autocast_cache_enabled()
}
ldm/modules/attention.py", line 175,直接注掉
from share import *
import config
import cv2
import einops
# import gradio as gr
import numpy as np
import torch
import random
from PIL import Image
import time
from pytorch_lightning import seed_everything
from annotator.util import resize_image, HWC3
from annotator.uniformer import UniformerDetector
from cldm.model import create_model, load_state_dict
from cldm.ddim_hacked import DDIMSampler
# Global inference setup (runs once at import time; order matters).
# Segmentation detector that produces the ControlNet conditioning map.
apply_uniformer = UniformerDetector()
# Build the ControlNet graph on CPU first, then load the seg-conditioned
# SD1.5 checkpoint and move the whole model to the GPU.
model = create_model('./models/cldm_v15.yaml').cpu()
model.load_state_dict(load_state_dict('./models/control_sd15_seg.pth', location='cuda'))
model = model.cuda()
# DDIM sampler bound to the loaded model.
ddim_sampler = DDIMSampler(model)
def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength,
            scale, seed, eta):
    """Run seg-conditioned ControlNet sampling for a single input image.

    Returns a list: [segmentation map] followed by num_samples generated
    HWC uint8 images.
    """
    with torch.no_grad():
        rgb = HWC3(input_image)
        detected_map = apply_uniformer(resize_image(rgb, detect_resolution))
        resized = resize_image(rgb, image_resolution)
        H, W, _ = resized.shape

        # Nearest-neighbour resize keeps segmentation colors/labels crisp.
        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)

        hint = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
        hint = torch.stack([hint] * num_samples, dim=0)
        hint = einops.rearrange(hint, 'b h w c -> b c h w').clone()

        if seed == -1:
            seed = random.randint(0, 65535)
        seed_everything(seed)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=False)

        cond = {"c_concat": [hint],
                "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
        un_cond = {"c_concat": None if guess_mode else [hint],
                   "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
        shape = (4, H // 8, W // 8)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=True)

        # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
        if guess_mode:
            model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)]
        else:
            model.control_scales = [strength] * 13

        samples, _ = ddim_sampler.sample(ddim_steps, num_samples,
                                         shape, cond, verbose=False, eta=eta,
                                         unconditional_guidance_scale=scale,
                                         unconditional_conditioning=un_cond)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=False)

        x_samples = model.decode_first_stage(samples)
        # Map latents from [-1, 1] to uint8 [0, 255] in HWC layout.
        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)

        outputs = [x_samples[i] for i in range(num_samples)]
    return [detected_map] + outputs
# Modern style -- 现代风格
# European style -- 欧式风格
# Chinese style -- 中式风格
# Mediterranean style -- 地中海风格
# Industrial style -- 工业风格
# Nordic style -- 北欧风格
# Bohemian style -- 波西米亚风格
# living room
# Dining room
# office
# bedroom
# bathroom
# gaming room
img = "room.png"
prompt = "Modern style "
a_prompt = "best quality, extremely detailed"
n_prompt = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"
num_samples = 1
image_resolution = 512
detect_resolution = 512 # 512
ddim_steps = 20
guess_mode = False
strength = 1.0
scale = 9.0
seed = 2023
eta = 0.0
input_image = np.array(Image.open(img))
start = time.time()
for i, prompt in enumerate([
"Modern style", "European style", "Chinese style", "Mediterranean style", "Industrial style", "Nordic style", "Bohemian style","Italian style","French style","Cream style","wood style","Romanesque style","Japanese style","American style","Metal style","pastoral style","country style","Luxury style","Southeast Asian style","Postmodern style","National style"]):
image = process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution,
ddim_steps, guess_mode, strength, scale, seed, eta)
cv2.imwrite(f"out_seg_{prompt}.png", image[0])
cv2.imwrite(f"out_{prompt}.png", image[1][:,:,::-1])
2. 更新oneformer的seg推理
hydra-core==1.1.2
detectron2:自行去git上下载detectron2的源码编译,python -m pip install -e detectron2
natten==0.14.1
编译一下oneformer的扩展模块,在controlnet/annotator/OneFormer/oneformer/modeling/pixel_decoder/ops中,直接运行python setup.py install --user.
wandb
diffdist
import torch
# print("Installed the dependencies!")
import numpy as np
from PIL import Image
import cv2
# import imutils
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data import MetadataCatalog
from annotator.OneFormer.oneformer import (
add_oneformer_config,
add_common_config,
add_swin_config,
add_dinat_config,
)
from annotator.OneFormer.demo.defaults import DefaultPredictor
from annotator.OneFormer.demo.visualizer import Visualizer, ColorMode
# import gradio as gr
from huggingface_hub import hf_hub_download
from annotator.OneFormer.demo.visualizer import Visualizer
KEY_DICT = {"Cityscapes (19 classes)": "cityscapes",
"COCO (133 classes)": "coco",
"ADE20K (150 classes)": "ade20k", }
SWIN_CFG_DICT = {"cityscapes": "configs/cityscapes/oneformer_swin_large_IN21k_384_bs16_90k.yaml",
"coco": "configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml",
"ade20k": "configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml", }
SWIN_MODEL_DICT = {
"cityscapes": hf_hub_download(repo_id="shi-labs/oneformer_cityscapes_swin_large",
filename="250_16_swin_l_oneformer_cityscapes_90k.pth"),
"coco": hf_hub_download(repo_id="shi-labs/oneformer_coco_swin_large",
filename="150_16_swin_l_oneformer_coco_100ep.pth"),
"ade20k": hf_hub_download(repo_id="shi-labs/oneformer_ade20k_swin_large",
filename="250_16_swin_l_oneformer_ade20k_160k.pth")
}
DINAT_CFG_DICT = {"cityscapes": "configs/cityscapes/oneformer_dinat_large_bs16_90k.yaml",
"coco": "configs/coco/oneformer_dinat_large_bs16_100ep.yaml",
"ade20k": "configs/ade20k/oneformer_dinat_large_IN21k_384_bs16_160k.yaml", }
DINAT_MODEL_DICT = {"cityscapes": hf_hub_download(repo_id="shi-labs/oneformer_cityscapes_dinat_large",
filename="250_16_dinat_l_oneformer_cityscapes_90k.pth"),
"coco": hf_hub_download(repo_id="shi-labs/oneformer_coco_dinat_large",
filename="150_16_dinat_l_oneformer_coco_100ep.pth"),
"ade20k": hf_hub_download(repo_id="shi-labs/oneformer_ade20k_dinat_large",
filename="250_16_dinat_l_oneformer_ade20k_160k.pth")
}
MODEL_DICT = {"DiNAT-L": DINAT_MODEL_DICT,
"Swin-L": SWIN_MODEL_DICT}
CFG_DICT = {"DiNAT-L": DINAT_CFG_DICT,
"Swin-L": SWIN_CFG_DICT}
WIDTH_DICT = {"cityscapes": 512,
"coco": 512,
"ade20k": 640}
cpu_device = torch.device("cpu")
PREDICTORS = {
"DiNAT-L": {
"Cityscapes (19 classes)": None,
"COCO (133 classes)": None,
"ADE20K (150 classes)": None
},
"Swin-L": {
"Cityscapes (19 classes)": None,
"COCO (133 classes)": None,
"ADE20K (150 classes)": None
}
}
METADATA = {
"DiNAT-L": {
"Cityscapes (19 classes)": None,
"COCO (133 classes)": None,
"ADE20K (150 classes)": None
},
"Swin-L": {
"Cityscapes (19 classes)": None,
"COCO (133 classes)": None,
"ADE20K (150 classes)": None
}
}
def setup_modules():
    """Build a DefaultPredictor + metadata for every (backbone, dataset) pair.

    Fills the module-level PREDICTORS / METADATA tables in place.
    """
    for dataset in ["Cityscapes (19 classes)", "COCO (133 classes)", "ADE20K (150 classes)"]:
        for backbone in ["DiNAT-L", "Swin-L"]:
            cfg = setup_cfg(dataset, backbone)
            test_sets = cfg.DATASETS.TEST_PANOPTIC
            metadata = MetadataCatalog.get(
                test_sets[0] if len(test_sets) else "__unused"
            )
            # BUG FIX: the original indexed test_sets[0] unconditionally here,
            # which raises IndexError when TEST_PANOPTIC is empty even though
            # the metadata lookup above already guarded that case.
            if len(test_sets) and 'cityscapes_fine_sem_seg_val' in test_sets[0]:
                from cityscapesscripts.helpers.labels import labels
                stuff_colors = [k.color for k in labels if k.trainId != 255]
                metadata = metadata.set(stuff_colors=stuff_colors)
            PREDICTORS[backbone][dataset] = DefaultPredictor(cfg)
            METADATA[backbone][dataset] = metadata
def setup_cfg(dataset, backbone):
    """Build a frozen detectron2 config for a dataset label and backbone name."""
    cfg = get_cfg()
    # Register every config extension OneFormer relies on before merging.
    for extend in (add_deeplab_config, add_common_config, add_swin_config,
                   add_oneformer_config, add_dinat_config):
        extend(cfg)
    dataset = KEY_DICT[dataset]
    cfg.merge_from_file(CFG_DICT[backbone][dataset])
    # Prefer GPU when one is visible to torch.
    cfg.MODEL.DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    cfg.MODEL.WEIGHTS = MODEL_DICT[backbone][dataset]
    cfg.freeze()
    return cfg
# Eagerly build all predictors at import time (downloads weights on first run).
setup_modules()
class OneformerDetector:
    """Semantic-segmentation annotator backed by OneFormer (DiNAT-L / ADE20K)."""

    def __init__(self, ):
        backbone = "DiNAT-L"
        dataset = "ADE20K (150 classes)"
        # Reuse the predictors/metadata built once by setup_modules().
        self.predictor = PREDICTORS[backbone][dataset]
        self.metadata = METADATA[backbone][dataset]

    def __call__(self, img):
        """Return the color-coded semantic segmentation visualization for img.

        BUG FIX: the original ran a second full Visualizer.draw_sem_seg pass
        whose result (`out`) was never used; that dead, expensive pass is
        removed. Only the text-free map (out_map) was ever returned.
        """
        predictions = self.predictor(img, "semantic")
        # Class-id map on CPU for drawing; argmax over the class dimension.
        sem_seg = predictions["sem_seg"].argmax(dim=0).to(cpu_device)
        visualizer_map = Visualizer(img[:, :, ::-1], is_img=False, metadata=self.metadata, instance_mode=ColorMode.IMAGE)
        out_map = visualizer_map.draw_sem_seg(
            sem_seg, alpha=1, is_text=False
        )
        return out_map