[linux-sd-webui] Image to prompt with blip/deepbooru

References:

- GitHub - pharmapsychotic/clip-interrogator: Image to prompt with BLIP and CLIP: https://github.com/pharmapsychotic/clip-interrogator
- GitHub - salesforce/LAVIS: LAVIS - A One-stop Library for Language-Vision Intelligence: https://github.com/salesforce/LAVIS
- clip_interrogator tutorial - Zhihu: https://zhuanlan.zhihu.com/p/624066332
- Model methods: image-to-text with clip-interrogator - Zhihu: https://zhuanlan.zhihu.com/p/578505705
- scripts/clip_interrogator_ext.py · db/clip-interrogator-ext - Gitee.com: https://gitee.com/dbscholar0/clip-interrogator-ext/blob/main/scripts/clip_interrogator_ext.py

blip is a multimodal vision-language model. The webui uses blip v1, although blip v2 is already available. deepbooru is suited to anime-style images; for other scenes blip is recommended. blip can be found in two repos: GitHub - pharmapsychotic/clip-interrogator (Image to prompt with BLIP and CLIP), and GitHub - salesforce/LAVIS, the original authors' consolidated library, which also contains the training code and covers both blip v1 and v2.

1. GitHub - pharmapsychotic/clip-interrogator: Image to prompt with BLIP and CLIP

This repo also supports blip v1/v2, matching the functionality in sd-webui, inference only. At its core it uses BlipForConditionalGeneration and Blip2ForConditionalGeneration from the Hugging Face transformers library.
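Before walking through the call trace below, here is a minimal sketch of that captioning path using transformers directly (the checkpoint name and max_length are illustrative assumptions, not values taken from the repo):

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# assumed checkpoint; clip-interrogator wires up its own caption model internally
model_id = "Salesforce/blip-image-captioning-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)

image = Image.open("test.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(device)  # pixel_values [1,3,384,384]
tokens = model.generate(**inputs, max_length=32)                  # greedy decoding by default
print(processor.batch_decode(tokens, skip_special_tokens=True)[0])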

config=Config->
ci=Interrogator(config)->clip_interrogator.clip_interrogator->load_caption_model()->load_clip_model()->
- tokenize=open_clip.get_tokenizer(clip_model_name)->
inference(ci,image,mode)->
Interrogator.interrogate()->
caption=caption or self.generate_caption(image)->
- self._prepare_caption()->
- self.caption_model=self.caption_model()->
inputs=self.caption_processor(pil_image)->
- transformers.models.blip.processing_blip.BlipProcessor.__call__->
- encoding_image_processor=self.image_processor(images)->
tokens=self.caption_model.generate(inputs[1,3,384,384],self.config.caption_max_length)[BlipForConditionalGeneration].generate()->
- vision_outputs=self.vision_model(pixel_values)->
- image_embeds=vision_outputs[0](1,577,1024)->
- outputs=self.text_decoder.generate()[transformers.generation.utils.py->GenerationMixin]->
-- model_kwargs['attention_mask']=self._prepare_attention_mask_for_generation(input)->
-- logits_processor=self._get_logits_processor()->
-- stopping_criteria=self._get_stopping_criteria()->
-- return self.greedy_search()->
--- outputs=self(**model_inputs,...)->
--- blip.modeling_blip_text.BlipTextLMHeadModel.forward()->
--- outputs=self.bert()->outputs:[last_hidden_state,past_key_values]->
--- sequence_output=outputs[0] [1,1,768]->
--- prediction_scores=self.cls(sequence_output)->
---- BlipTextOnlyMLMHead.forward()->
---- BlipTextLMPredictionHead().forward()->transform->decoder->
---- prediction_scores [1,1,30524]
--- blip.modeling_outputs.CausalLMOutputWithCrossAttentions()->
-- next_token_logits=outputs.logits[:,-1,:]->
-- next_tokens_scores=logits_processor(input_ids,next_token_logits)->
-- next_tokens=torch.argmax(next_tokens_scores,dim=-1) [1] ->
-- input_ids [1,11]->
-self.caption_processor.batch_decode(tokens)->
-- blip.processing_blip.BlipProcessor.batch_decode()->
-- tokenization_utils_base.PreTrainedTokenizerBase().batch_decode()->
-- tokenization_utils_fast.PreTrainedTokenizerFast()._decode()->
--- text=self._tokenizer.decode()->
image_features=self.image_to_features(image)->
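Put together, the chain above corresponds to a minimal usage sketch like the following (the model names are the library defaults as I understand them and should be treated as assumptions):

from PIL import Image
from clip_interrogator import Config, Interrogator

# assumed defaults: ViT-L-14/openai for CLIP and the blip-large caption model
ci = Interrogator(Config(clip_model_name="ViT-L-14/openai", caption_model_name="blip-large"))
image = Image.open("test.jpg").convert("RGB")
print(ci.interrogate(image))  # BLIP caption plus CLIP-ranked terms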

The model's output consists of several parts:

people walking around a building with a glass facade, rendering of the windigo, stroopwaffel, grand library, inspired by Peter Fiore, detailled light, h 1024, archviz, inspired by Lodewijk Bruckman, the photo shows a large, librarian, soft curvy shape, phase 2, clogs

In the long description above, only the part before the first comma is generated from the objects and their positions in the image; everything after it comes from four datasets, from which the entries with the highest similarity to the image features are selected and ranked. The datasets are:

artists, flavors, mediums, movements

artists contains painter names, while mediums and movements cover the kind of medium/style and the artistic movement.

flavors holds a large number of descriptive phrases, so suitable candidates can be found quickly and scored with CLIP to obtain the best matches. The first clause is generated by blip.

None of this exists in stable-diffusion-webui: the webui stops at the prompt generated by the LM model and does not go on to run CLIP similarity over the category word lists.
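Conceptually, that extra ranking step is just CLIP image-text similarity over each term list, roughly like the sketch below (illustrative phrases and model names, not the library's actual code):

import torch
import open_clip
from PIL import Image

model, _, preprocess = open_clip.create_model_and_transforms("ViT-L-14", pretrained="openai")
tokenizer = open_clip.get_tokenizer("ViT-L-14")

phrases = ["grand library", "archviz", "detailed light"]  # e.g. entries from the flavors list
image = preprocess(Image.open("test.jpg").convert("RGB")).unsqueeze(0)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(tokenizer(phrases))
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = image_features @ text_features.T  # [1, len(phrases)]

best = similarity[0].topk(min(3, len(phrases)))    # highest-scoring phrases win
print([phrases[i] for i in best.indices.tolist()])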

install

The main issue is with the open_clip_torch library.

cp -r /home/sniss/.local/lib/python3.7/site-packages/open_clip_torch-2.16.0-py3.7.egg/open_clip/openai.py /home/sniss/local_disk/

Change it around line 57:

#     if get_pretrained_url(name, 'openai'):
#         model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
#     elif os.path.isfile(name):
#         model_path = name
#     else:
#         raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
    model_path = cache_dir
cp -r openai.py /home/sniss/.local/lib/python3.7/site-packages/open_clip_torch-2.16.0-py3.7.egg/open_clip/openai.py

2. deepdanbooru

# from AUTOMATIC1111
# maybe modified by Nyanko Lepsoni
# modified by crosstyan
import os.path
import re
import tempfile
import argparse
import glob
import zipfile
import deepdanbooru as dd
import tensorflow as tf
import numpy as np

from basicsr.utils.download_util import load_file_from_url
from PIL import Image
from tqdm import tqdm

re_special = re.compile(r"([\\()])")

def get_deepbooru_tags_model(model_path: str):
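    # download and unpack the DeepDanbooru v3 model on first use, then load its tag list and model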
    if not os.path.exists(os.path.join(model_path, "project.json")):
        is_abs = os.path.isabs(model_path)
        if not is_abs:
            model_path = os.path.abspath(model_path)
            
        load_file_from_url(
            r"https://github.com/KichangKim/DeepDanbooru/releases/download/v3-20211112-sgd-e28/deepdanbooru-v3-20211112-sgd-e28.zip",
            model_path,
        )
        with zipfile.ZipFile(
            os.path.join(model_path, "deepdanbooru-v3-20211112-sgd-e28.zip"), "r"
        ) as zip_ref:
            zip_ref.extractall(model_path)
        os.remove(os.path.join(model_path, "deepdanbooru-v3-20211112-sgd-e28.zip"))

    tags = dd.project.load_tags_from_project(model_path)
    model = dd.project.load_model_from_project(model_path, compile_model=False)
    return model, tags


def get_deepbooru_tags_from_model(
    model,
    tags,
    pil_image,
    threshold,
    alpha_sort=False,
    use_spaces=True,
    use_escape=True,
    include_ranks=False,
):
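    # resize/pad the image to the model's input size, run the classifier,
    # and return the tags above the threshold as a comma-separated prompt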
    width = model.input_shape[2]
    height = model.input_shape[1]
    image = np.array(pil_image)
    image = tf.image.resize(
        image,
        size=(height, width),
        method=tf.image.ResizeMethod.AREA,
        preserve_aspect_ratio=True,
    )
    image = image.numpy()  # EagerTensor to np.array
    image = dd.image.transform_and_pad_image(image, width, height)
    image = image / 255.0
    image_shape = image.shape
    image = image.reshape((1, image_shape[0], image_shape[1], image_shape[2]))

    y = model.predict(image)[0]

    result_dict = {}

    for i, tag in enumerate(tags):
        result_dict[tag] = y[i]

    unsorted_tags_in_threshold = []
    result_tags_print = []
    for tag in tags:
        if result_dict[tag] >= threshold:
            if tag.startswith("rating:"):
                continue
            unsorted_tags_in_threshold.append((result_dict[tag], tag))
            result_tags_print.append(f"{result_dict[tag]} {tag}")

    # sort tags
    result_tags_out = []
    sort_ndx = 0
    if alpha_sort:
        sort_ndx = 1

    # sort descending by likelihood (or ascending alphabetically), and format tag text as requested
    unsorted_tags_in_threshold.sort(key=lambda y: y[sort_ndx], reverse=(not alpha_sort))
    for weight, tag in unsorted_tags_in_threshold:
        tag_outformat = tag
        if use_spaces:
            tag_outformat = tag_outformat.replace("_", " ")
        if use_escape:
            tag_outformat = re.sub(re_special, r"\\\1", tag_outformat)
        if include_ranks:
            tag_outformat = f"({tag_outformat}:{weight:.3f})"

        result_tags_out.append(tag_outformat)

    # print("\n".join(sorted(result_tags_print, reverse=True)))

    return ", ".join(result_tags_out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str, default="./images/")
    parser.add_argument("--threshold", type=int, default=0.75)
    parser.add_argument("--alpha_sort", type=bool, default=False)
    parser.add_argument("--use_spaces", type=bool, default=True)
    parser.add_argument("--use_escape", type=bool, default=True)
    parser.add_argument("--model_path", type=str, default="./deepdanbooru-models")
    parser.add_argument("--include_ranks", type=bool, default=False)

    args = parser.parse_args()

#     global model_path
#     model_path:str
    if args.model_path == "":
        script_path = os.path.realpath(__file__)
        default_model_path = os.path.join(os.path.dirname(script_path), "deepdanbooru-models")
        # print("No model path specified, using default model path: {}".format(default_model_path))
        model_path = default_model_path
    else:
        model_path = args.model_path

    types = ('*.jpg', '*.png', '*.jpeg', '*.gif', '*.webp', '*.bmp') 
    files_grabbed = []
    for files in types:
        files_grabbed.extend(glob.glob(os.path.join(args.path, files)))
        # print(glob.glob(args.path + files))
        
    model, tags = get_deepbooru_tags_model(model_path)
    for image_path in tqdm(files_grabbed, desc="Processing"):
        image = Image.open(image_path).convert("RGB")
        prompt = get_deepbooru_tags_from_model(
            model,
            tags,
            image,
            args.threshold,
            alpha_sort=args.alpha_sort,
            use_spaces=args.use_spaces,
            use_escape=args.use_escape,
            include_ranks=args.include_ranks,
        )
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        txt_filename = os.path.join(args.path, f"{image_name}.txt")
        # print(f"writing {txt_filename}: {prompt}")
        with open(txt_filename, 'w') as f:
            f.write(prompt)
        

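Assuming the script above is saved as deepbooru_tag.py (the filename is arbitrary), it can be run over a folder of images and will write one .txt tag file per image into that folder:

python deepbooru_tag.py --path ./images/ --threshold 0.75 --model_path ./deepdanbooru-models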

Reposted from blog.csdn.net/u012193416/article/details/130508971