ComfyUI v0.12.2

Fix crash with ace step 1.5 (#12264)
mm: Remove Aimdo exemption for empty_cache (#12260)
2026-02-04 00:08:59 -05:00 · 2026-02-04 00:03:21 -05:00 · 2026-02-03 21:39:19 -05:00 · 2026-02-03 19:01:38 -05:00 · 2026-02-03 15:01:46 -05:00 · 2026-02-03 14:40:45 -05:00
161 changed files with 15099 additions and 1035 deletions
@@ -20,7 +20,7 @@ jobs:
      git_tag: ${{ inputs.git_tag }}
      cache_tag: "cu130"
      python_minor: "13"
-      python_patch: "9"
+      python_patch: "11"
      rel_name: "nvidia"
      rel_extra_name: ""
      test_release: true
@@ -65,11 +65,11 @@ jobs:
      contents: "write"
      packages: "write"
      pull-requests: "read"
-    name: "Release AMD ROCm 7.1.1"
+    name: "Release AMD ROCm 7.2"
    uses: ./.github/workflows/stable-release.yml
    with:
      git_tag: ${{ inputs.git_tag }}
-      cache_tag: "rocm711"
+      cache_tag: "rocm72"
      python_minor: "12"
      python_patch: "10"
      rel_name: "amd"
@@ -29,7 +29,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "9"
+        default: "11"
 #  push:
 #    branches:
 #      - master
@@ -208,7 +208,7 @@ comfy install

 ## Manual Install (Windows, Linux)

-Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies.
+Python 3.14 works, but some custom nodes may have issues. The free-threaded variant also works, but some dependencies re-enable the GIL, so it is not fully supported.

 Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12

@@ -1,5 +1,8 @@
 import logging
 import uuid
+import urllib.parse
+import os
+import contextlib
 from aiohttp import web

 from pydantic import ValidationError
@@ -8,6 +11,9 @@ import app.assets.manager as manager
 from app import user_manager
 from app.assets.api import schemas_in
 from app.assets.helpers import get_query_dict
+from app.assets.scanner import seed_assets
+
+import folder_paths

 ROUTES = web.RouteTableDef()
 USER_MANAGER: user_manager.UserManager | None = None
@@ -15,6 +21,9 @@ USER_MANAGER: user_manager.UserManager | None = None
 # UUID regex (canonical hyphenated form, case-insensitive)
 UUID_RE = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"

+# Note to any custom node developers reading this code:
+# The assets system is not yet fully implemented; do not rely on the code in /app/assets remaining the same.
+
 def register_assets_system(app: web.Application, user_manager_instance: user_manager.UserManager) -> None:
    global USER_MANAGER
    USER_MANAGER = user_manager_instance
@@ -28,6 +37,18 @@ def _validation_error_response(code: str, ve: ValidationError) -> web.Response:
    return _error_response(400, code, "Validation failed.", {"errors": ve.json()})


+@ROUTES.head("/api/assets/hash/{hash}")
+async def head_asset_by_hash(request: web.Request) -> web.Response:
+    """Report, by status code only, whether content with the given hash is known.
+
+    Responds 200 when ``manager.asset_exists`` is truthy for the normalized hash,
+    404 when it is not, and 400 when the path segment is not ``blake3:<hex>``.
+    """
+    normalized = request.match_info.get("hash", "").strip().lower()
+    scheme, colon, hex_part = normalized.partition(":")
+    well_formed = (
+        bool(colon)
+        and scheme == "blake3"
+        and bool(hex_part)
+        and all(ch in "0123456789abcdef" for ch in hex_part)
+    )
+    if not well_formed:
+        return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:<hex>'")
+    if manager.asset_exists(asset_hash=normalized):
+        return web.Response(status=200)
+    return web.Response(status=404)
+
+
@ROUTES.get("/api/assets")
 async def list_assets(request: web.Request) -> web.Response:
    """
@@ -50,7 +71,7 @@ async def list_assets(request: web.Request) -> web.Response:
        order=q.order,
        owner_id=USER_MANAGER.get_request_user_id(request),
    )
-    return web.json_response(payload.model_dump(mode="json"))
+    return web.json_response(payload.model_dump(mode="json", exclude_none=True))


@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}")
@@ -76,6 +97,314 @@ async def get_asset(request: web.Request) -> web.Response:
    return web.json_response(result.model_dump(mode="json"), status=200)


+@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}/content")
+async def download_asset_content(request: web.Request) -> web.Response:
+    """Stream the file behind an AssetInfo back to the client.
+
+    Query parameters:
+        disposition: ``inline`` or ``attachment``; any other value falls back to ``attachment``.
+
+    Error responses:
+        404 ``ASSET_NOT_FOUND`` when the manager raises ValueError,
+        501 ``BACKEND_UNSUPPORTED`` when it raises NotImplementedError,
+        404 ``FILE_NOT_FOUND`` when it raises FileNotFoundError.
+    """
+    # question: do we need disposition? could we just stick with one of these?
+    disposition = request.query.get("disposition", "attachment").lower().strip()
+    if disposition not in {"inline", "attachment"}:
+        disposition = "attachment"
+
+    try:
+        abs_path, content_type, filename = manager.resolve_asset_content_for_download(
+            asset_info_id=str(uuid.UUID(request.match_info["id"])),
+            owner_id=USER_MANAGER.get_request_user_id(request),
+        )
+    except ValueError as ve:
+        return _error_response(404, "ASSET_NOT_FOUND", str(ve))
+    except NotImplementedError as nie:
+        return _error_response(501, "BACKEND_UNSUPPORTED", str(nie))
+    except FileNotFoundError:
+        return _error_response(404, "FILE_NOT_FOUND", "Underlying file not found on disk.")
+
+    # The resolved filename may be missing. Normalize it once so that both the quoted
+    # fallback and the percent-encoded ``filename*`` form are built from a str;
+    # urllib.parse.quote(None) would raise TypeError.
+    safe_name = filename or ""
+    quoted = safe_name.replace("\r", "").replace("\n", "").replace('"', "'")
+    cd = f'{disposition}; filename="{quoted}"; filename*=UTF-8\'\'{urllib.parse.quote(safe_name)}'
+
+    file_size = os.path.getsize(abs_path)
+    logging.info(
+        "download_asset_content: path=%s, size=%d bytes (%.2f MB), content_type=%s, filename=%s",
+        abs_path,
+        file_size,
+        file_size / (1024 * 1024),
+        content_type,
+        filename,
+    )
+
+    async def file_sender():
+        # Yield the file in 64 KiB chunks so the whole file is never held in memory.
+        chunk_size = 64 * 1024
+        with open(abs_path, "rb") as f:
+            while True:
+                chunk = f.read(chunk_size)
+                if not chunk:
+                    break
+                yield chunk
+
+    return web.Response(
+        body=file_sender(),
+        content_type=content_type,
+        headers={
+            "Content-Disposition": cd,
+            "Content-Length": str(file_size),
+        },
+    )
+
+
+@ROUTES.post("/api/assets/from-hash")
+async def create_asset_from_hash(request: web.Request) -> web.Response:
+    """Create an asset reference for content that is already known by hash.
+
+    Responses:
+        201 with the created record,
+        400 ``INVALID_BODY`` / ``INVALID_JSON`` for a bad request body,
+        404 ``ASSET_NOT_FOUND`` when the manager returns None for the hash,
+        500 ``INTERNAL`` when the manager raises.
+    """
+    try:
+        payload = await request.json()
+        body = schemas_in.CreateFromHashBody.model_validate(payload)
+    except ValidationError as ve:
+        return _validation_error_response("INVALID_BODY", ve)
+    except Exception:
+        return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
+
+    owner_id = USER_MANAGER.get_request_user_id(request)
+    # Same guard as the fast path in upload_asset: log the failure and answer with
+    # the structured INTERNAL error instead of letting the exception escape.
+    try:
+        result = manager.create_asset_from_hash(
+            hash_str=body.hash,
+            name=body.name,
+            tags=body.tags,
+            user_metadata=body.user_metadata,
+            owner_id=owner_id,
+        )
+    except Exception:
+        logging.exception("create_asset_from_hash failed for hash=%s, owner_id=%s", body.hash, owner_id)
+        return _error_response(500, "INTERNAL", "Unexpected server error.")
+
+    if result is None:
+        return _error_response(404, "ASSET_NOT_FOUND", f"Asset content {body.hash} does not exist")
+    return web.json_response(result.model_dump(mode="json"), status=201)
+
+
+def _discard_temp_upload(tmp_path: str | None) -> None:
+    """Best-effort removal of a stored upload and the per-request directory created for it."""
+    if not tmp_path:
+        return
+    with contextlib.suppress(OSError):
+        os.remove(tmp_path)
+    # The directory was created only for this upload; rmdir succeeds only if it is now empty.
+    with contextlib.suppress(OSError):
+        os.rmdir(os.path.dirname(tmp_path))
+
+
+@ROUTES.post("/api/assets")
+async def upload_asset(request: web.Request) -> web.Response:
+    """Multipart/form-data endpoint for Asset uploads.
+
+    Recognized parts: ``file``, ``hash`` (``blake3:<hex>``), ``tags`` (repeatable),
+    ``name`` and ``user_metadata``. When ``hash`` names content that already exists,
+    the asset is created from that content and uploaded bytes are not ingested.
+    Every error response returned after a file part was stored discards the temp file.
+
+    Responses: 201/200 with the created record (new / already existing), 400 for
+    invalid input, 404 when neither known content nor a file is available,
+    415 for non-multipart requests, 500 for I/O or unexpected errors.
+    """
+    if not (request.content_type or "").lower().startswith("multipart/"):
+        return _error_response(415, "UNSUPPORTED_MEDIA_TYPE", "Use multipart/form-data for uploads.")
+
+    reader = await request.multipart()
+
+    file_present = False
+    file_client_name: str | None = None
+    tags_raw: list[str] = []
+    provided_name: str | None = None
+    user_metadata_raw: str | None = None
+    provided_hash: str | None = None
+    # True/False once the existence check ran; None when no hash was given or the check failed.
+    provided_hash_exists: bool | None = None
+
+    file_written = 0
+    tmp_path: str | None = None
+    while True:
+        field = await reader.next()
+        if field is None:
+            break
+
+        fname = getattr(field, "name", "") or ""
+
+        if fname == "hash":
+            try:
+                s = ((await field.text()) or "").strip().lower()
+            except Exception:
+                _discard_temp_upload(tmp_path)
+                return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:<hex>'")
+
+            if s:
+                algo, sep, digest = s.partition(":")
+                if not sep or algo != "blake3" or not digest or any(c not in "0123456789abcdef" for c in digest):
+                    # The file part may have arrived before the hash part.
+                    _discard_temp_upload(tmp_path)
+                    return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:<hex>'")
+                provided_hash = f"{algo}:{digest}"
+                try:
+                    provided_hash_exists = manager.asset_exists(asset_hash=provided_hash)
+                except Exception:
+                    provided_hash_exists = None  # do not fail the whole request here
+
+        elif fname == "file":
+            file_present = True
+            file_client_name = (field.filename or "").strip()
+
+            if provided_hash and provided_hash_exists is True:
+                # If client supplied a hash that we know exists, drain but do not write to disk
+                try:
+                    while chunk := await field.read_chunk(8 * 1024 * 1024):
+                        file_written += len(chunk)
+                except Exception:
+                    _discard_temp_upload(tmp_path)
+                    return _error_response(500, "UPLOAD_IO_ERROR", "Failed to receive uploaded file.")
+                continue  # Do not create temp file; we will create AssetInfo from the existing content
+
+            # A repeated 'file' part replaces the earlier one; drop its temp file first.
+            _discard_temp_upload(tmp_path)
+
+            # Otherwise, store to temp for hashing/ingest
+            uploads_root = os.path.join(folder_paths.get_temp_directory(), "uploads")
+            unique_dir = os.path.join(uploads_root, uuid.uuid4().hex)
+            os.makedirs(unique_dir, exist_ok=True)
+            tmp_path = os.path.join(unique_dir, ".upload.part")
+
+            try:
+                with open(tmp_path, "wb") as f:
+                    while chunk := await field.read_chunk(8 * 1024 * 1024):
+                        f.write(chunk)
+                        file_written += len(chunk)
+            except Exception:
+                _discard_temp_upload(tmp_path)
+                return _error_response(500, "UPLOAD_IO_ERROR", "Failed to receive and store uploaded file.")
+        elif fname == "tags":
+            tags_raw.append((await field.text()) or "")
+        elif fname == "name":
+            provided_name = (await field.text()) or None
+        elif fname == "user_metadata":
+            user_metadata_raw = (await field.text()) or None
+
+    known_hash = bool(provided_hash and provided_hash_exists)
+
+    # If client did not send file, and we are not doing a from-hash fast path -> error
+    if not file_present and not known_hash:
+        return _error_response(400, "MISSING_FILE", "Form must include a 'file' part or a known 'hash'.")
+
+    if file_present and file_written == 0 and not known_hash:
+        # Empty upload is only acceptable if we are fast-pathing from existing hash
+        _discard_temp_upload(tmp_path)
+        return _error_response(400, "EMPTY_UPLOAD", "Uploaded file is empty.")
+
+    try:
+        spec = schemas_in.UploadAssetSpec.model_validate({
+            "tags": tags_raw,
+            "name": provided_name,
+            "user_metadata": user_metadata_raw,
+            "hash": provided_hash,
+        })
+    except ValidationError as ve:
+        _discard_temp_upload(tmp_path)
+        return _validation_error_response("INVALID_BODY", ve)
+
+    # Validate models category against configured folders (consistent with previous behavior)
+    if spec.tags and spec.tags[0] == "models":
+        if len(spec.tags) < 2 or spec.tags[1] not in folder_paths.folder_names_and_paths:
+            _discard_temp_upload(tmp_path)
+            return _error_response(
+                400, "INVALID_BODY", f"unknown models category '{spec.tags[1] if len(spec.tags) >= 2 else ''}'"
+            )
+
+    owner_id = USER_MANAGER.get_request_user_id(request)
+
+    # Fast path: if a valid provided hash exists, create AssetInfo without writing anything
+    if spec.hash and provided_hash_exists is True:
+        # The fast path never ingests the upload. Drop a temp file saved because the
+        # hash field came after the file, whatever the outcome below.
+        _discard_temp_upload(tmp_path)
+        try:
+            result = manager.create_asset_from_hash(
+                hash_str=spec.hash,
+                name=spec.name or (spec.hash.split(":", 1)[1]),
+                tags=spec.tags,
+                user_metadata=spec.user_metadata or {},
+                owner_id=owner_id,
+            )
+        except Exception:
+            logging.exception("create_asset_from_hash failed for hash=%s, owner_id=%s", spec.hash, owner_id)
+            return _error_response(500, "INTERNAL", "Unexpected server error.")
+
+        if result is None:
+            return _error_response(404, "ASSET_NOT_FOUND", f"Asset content {spec.hash} does not exist")
+
+        status = 201 if result.created_new else 200
+        return web.json_response(result.model_dump(mode="json"), status=status)
+
+    # Otherwise, we must have a temp file path to ingest
+    if not tmp_path or not os.path.exists(tmp_path):
+        # The only case we reach here without a temp file is: client sent a hash that does not exist and no file
+        _discard_temp_upload(tmp_path)
+        return _error_response(404, "ASSET_NOT_FOUND", "Provided hash not found and no file uploaded.")
+
+    try:
+        created = manager.upload_asset_from_temp_path(
+            spec,
+            temp_path=tmp_path,
+            client_filename=file_client_name,
+            owner_id=owner_id,
+            expected_asset_hash=spec.hash,
+        )
+        status = 201 if created.created_new else 200
+        return web.json_response(created.model_dump(mode="json"), status=status)
+    except ValueError as e:
+        _discard_temp_upload(tmp_path)
+        msg = str(e)
+        if "HASH_MISMATCH" in msg or msg.strip().upper() == "HASH_MISMATCH":
+            return _error_response(
+                400,
+                "HASH_MISMATCH",
+                "Uploaded file hash does not match provided hash.",
+            )
+        return _error_response(400, "BAD_REQUEST", "Invalid inputs.")
+    except Exception:
+        _discard_temp_upload(tmp_path)
+        logging.exception("upload_asset_from_temp_path failed for tmp_path=%s, owner_id=%s", tmp_path, owner_id)
+        return _error_response(500, "INTERNAL", "Unexpected server error.")
+
+
+@ROUTES.put(f"/api/assets/{{id:{UUID_RE}}}")
+async def update_asset(request: web.Request) -> web.Response:
+    """Update the name and/or user metadata of an AssetInfo.
+
+    Responses: 200 with the manager's result, 400 for an invalid body,
+    404 when the manager raises ValueError or PermissionError, 500 otherwise.
+    """
+    info_id = str(uuid.UUID(request.match_info["id"]))
+    try:
+        raw = await request.json()
+        body = schemas_in.UpdateAssetBody.model_validate(raw)
+    except ValidationError as ve:
+        return _validation_error_response("INVALID_BODY", ve)
+    except Exception:
+        return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
+
+    try:
+        result = manager.update_asset(
+            asset_info_id=info_id,
+            name=body.name,
+            user_metadata=body.user_metadata,
+            owner_id=USER_MANAGER.get_request_user_id(request),
+        )
+    except (ValueError, PermissionError) as ve:
+        return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": info_id})
+    except Exception:
+        logging.exception(
+            "update_asset failed for asset_info_id=%s, owner_id=%s",
+            info_id,
+            USER_MANAGER.get_request_user_id(request),
+        )
+        return _error_response(500, "INTERNAL", "Unexpected server error.")
+    else:
+        return web.json_response(result.model_dump(mode="json"), status=200)
+
+
+@ROUTES.delete(f"/api/assets/{{id:{UUID_RE}}}")
+async def delete_asset(request: web.Request) -> web.Response:
+    """Delete an AssetInfo reference: 204 on success, 404 when nothing was deleted.
+
+    The ``delete_content`` query flag defaults to true; only "0", "false" or "no"
+    (case-insensitive) turn it off.
+    """
+    info_id = str(uuid.UUID(request.match_info["id"]))
+    flag = request.query.get("delete_content")
+    delete_orphan_content = flag is None or flag.lower() not in {"0", "false", "no"}
+
+    try:
+        deleted = manager.delete_asset_reference(
+            asset_info_id=info_id,
+            owner_id=USER_MANAGER.get_request_user_id(request),
+            delete_content_if_orphan=delete_orphan_content,
+        )
+    except Exception:
+        logging.exception(
+            "delete_asset_reference failed for asset_info_id=%s, owner_id=%s",
+            info_id,
+            USER_MANAGER.get_request_user_id(request),
+        )
+        return _error_response(500, "INTERNAL", "Unexpected server error.")
+
+    if deleted:
+        return web.Response(status=204)
+    return _error_response(404, "ASSET_NOT_FOUND", f"AssetInfo {info_id} not found.")
+
+
@ROUTES.get("/api/tags")
 async def get_tags(request: web.Request) -> web.Response:
    """
@@ -100,3 +429,86 @@ async def get_tags(request: web.Request) -> web.Response:
        owner_id=USER_MANAGER.get_request_user_id(request),
    )
    return web.json_response(result.model_dump(mode="json"))
+
+
+@ROUTES.post(f"/api/assets/{{id:{UUID_RE}}}/tags")
+async def add_asset_tags(request: web.Request) -> web.Response:
+    """Attach tags (origin ``manual``) to an AssetInfo and return the manager's result as JSON.
+
+    Responses: 200 on success, 400 for an invalid body, 404 when the manager
+    raises ValueError or PermissionError, 500 otherwise.
+    """
+    info_id = str(uuid.UUID(request.match_info["id"]))
+    try:
+        raw = await request.json()
+        data = schemas_in.TagsAdd.model_validate(raw)
+    except ValidationError as ve:
+        return _error_response(400, "INVALID_BODY", "Invalid JSON body for tags add.", {"errors": ve.errors()})
+    except Exception:
+        return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
+
+    try:
+        result = manager.add_tags_to_asset(
+            asset_info_id=info_id,
+            tags=data.tags,
+            origin="manual",
+            owner_id=USER_MANAGER.get_request_user_id(request),
+        )
+    except (ValueError, PermissionError) as ve:
+        return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": info_id})
+    except Exception:
+        logging.exception(
+            "add_tags_to_asset failed for asset_info_id=%s, owner_id=%s",
+            info_id,
+            USER_MANAGER.get_request_user_id(request),
+        )
+        return _error_response(500, "INTERNAL", "Unexpected server error.")
+    else:
+        return web.json_response(result.model_dump(mode="json"), status=200)
+
+
+@ROUTES.delete(f"/api/assets/{{id:{UUID_RE}}}/tags")
+async def delete_asset_tags(request: web.Request) -> web.Response:
+    """Remove tags from an AssetInfo and return the manager's result as JSON.
+
+    Responses: 200 on success, 400 for an invalid body, 404 when the manager
+    raises ValueError or PermissionError, 500 otherwise.
+    """
+    asset_info_id = str(uuid.UUID(request.match_info["id"]))
+    try:
+        payload = await request.json()
+        data = schemas_in.TagsRemove.model_validate(payload)
+    except ValidationError as ve:
+        return _error_response(400, "INVALID_BODY", "Invalid JSON body for tags remove.", {"errors": ve.errors()})
+    except Exception:
+        return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
+
+    try:
+        result = manager.remove_tags_from_asset(
+            asset_info_id=asset_info_id,
+            tags=data.tags,
+            owner_id=USER_MANAGER.get_request_user_id(request),
+        )
+    # PermissionError is mapped to 404 as well, matching add_asset_tags and update_asset.
+    except (ValueError, PermissionError) as ve:
+        return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id})
+    except Exception:
+        logging.exception(
+            "remove_tags_from_asset failed for asset_info_id=%s, owner_id=%s",
+            asset_info_id,
+            USER_MANAGER.get_request_user_id(request),
+        )
+        return _error_response(500, "INTERNAL", "Unexpected server error.")
+
+    return web.json_response(result.model_dump(mode="json"), status=200)
+
+
+@ROUTES.post("/api/assets/seed")
+async def seed_assets_endpoint(request: web.Request) -> web.Response:
+    """Trigger asset seeding for specified roots (models, input, output).
+
+    The JSON body may carry ``{"roots": [...]}``. A missing, malformed or non-object
+    body seeds all three roots. Unknown root names are ignored; if none remain
+    (including when ``roots`` is not a list) the response is 400 ``INVALID_BODY``.
+    A failure inside ``seed_assets`` is logged and reported as 500 ``INTERNAL``.
+    """
+    allowed_roots = ("models", "input", "output")
+    try:
+        payload = await request.json()
+    except Exception:
+        payload = None
+
+    roots = payload.get("roots", list(allowed_roots)) if isinstance(payload, dict) else list(allowed_roots)
+    # A non-list value (null, number, string, object) cannot name any root. Treat it as
+    # "no valid roots" instead of letting the iteration below raise.
+    if not isinstance(roots, (list, tuple)):
+        roots = []
+
+    valid_roots = [r for r in roots if r in allowed_roots]
+    if not valid_roots:
+        return _error_response(400, "INVALID_BODY", "No valid roots specified")
+
+    try:
+        seed_assets(tuple(valid_roots))
+    except Exception:
+        logging.exception("seed_assets failed for roots=%s", valid_roots)
+        return _error_response(500, "INTERNAL", "Seed operation failed")
+
+    return web.json_response({"seeded": valid_roots}, status=200)
@@ -1,5 +1,4 @@
 import json
-import uuid
 from typing import Any, Literal

 from pydantic import (
@@ -8,9 +7,9 @@ from pydantic import (
    Field,
    conint,
    field_validator,
+    model_validator,
 )

-
 class ListAssetsQuery(BaseModel):
    include_tags: list[str] = Field(default_factory=list)
    exclude_tags: list[str] = Field(default_factory=list)
@@ -57,6 +56,57 @@ class ListAssetsQuery(BaseModel):
        return None


+class UpdateAssetBody(BaseModel):
+    """Asset update body; at least one of ``name`` / ``user_metadata`` must be supplied."""
+    name: str | None = None
+    user_metadata: dict[str, Any] | None = None
+
+    @model_validator(mode="after")
+    def _at_least_one(self):
+        # Reject a body that would change nothing.
+        nothing_to_update = all(value is None for value in (self.name, self.user_metadata))
+        if nothing_to_update:
+            raise ValueError("Provide at least one of: name, user_metadata.")
+        return self
+
+
+class CreateFromHashBody(BaseModel):
+    """Request body for creating an asset from a content hash.
+
+    ``hash`` must be ``blake3:<hex>`` and is stored lower-cased. ``tags`` accepts a
+    list or a comma-separated string and is normalized to stripped, lower-cased,
+    order-preserving unique values. Unknown fields are ignored.
+    """
+    model_config = ConfigDict(extra="ignore", str_strip_whitespace=True)
+
+    hash: str
+    name: str
+    tags: list[str] = Field(default_factory=list)
+    user_metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("hash")
+    @classmethod
+    def _require_blake3(cls, v):
+        """Lower-case the hash and require the ``blake3:<hex>`` form."""
+        s = (v or "").strip().lower()
+        algo, sep, digest = s.partition(":")
+        if not sep:
+            raise ValueError("hash must be 'blake3:<hex>'")
+        if algo != "blake3":
+            raise ValueError("only canonical 'blake3:<hex>' is accepted here")
+        if not digest or any(c not in "0123456789abcdef" for c in digest):
+            raise ValueError("hash digest must be lowercase hex")
+        return s
+
+    @field_validator("tags", mode="before")
+    @classmethod
+    def _tags_norm(cls, v):
+        """Normalize tags from a list or comma-separated string; other inputs yield []."""
+        if isinstance(v, str):
+            v = v.split(",")
+        if not isinstance(v, list):
+            return []
+        cleaned = (str(t).strip().lower() for t in v)
+        # dict.fromkeys de-duplicates while keeping first-seen order. It now applies
+        # to the comma-separated form too, which previously kept duplicates.
+        return list(dict.fromkeys(t for t in cleaned if t))
+
+
 class TagsListQuery(BaseModel):
    model_config = ConfigDict(extra="ignore", str_strip_whitespace=True)

@@ -75,20 +125,140 @@ class TagsListQuery(BaseModel):
        return v.lower() or None


-class SetPreviewBody(BaseModel):
-    """Set or clear the preview for an AssetInfo. Provide an Asset.id or null."""
-    preview_id: str | None = None
+class TagsAdd(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+    tags: list[str] = Field(..., min_length=1)

-    @field_validator("preview_id", mode="before")
+    @field_validator("tags")
    @classmethod
-    def _norm_uuid(cls, v):
+    def normalize_tags(cls, v: list[str]) -> list[str]:
+        out = []
+        for t in v:
+            if not isinstance(t, str):
+                raise TypeError("tags must be strings")
+            tnorm = t.strip().lower()
+            if tnorm:
+                out.append(tnorm)
+        seen = set()
+        deduplicated = []
+        for x in out:
+            if x not in seen:
+                seen.add(x)
+                deduplicated.append(x)
+        return deduplicated
+
+
+class TagsRemove(TagsAdd):
+    """Same fields and tag normalization as TagsAdd, used to validate tag-removal bodies."""
+    pass
+
+
+class UploadAssetSpec(BaseModel):
+    """Upload Asset operation.
+    - tags: ordered; first is root ('models'|'input'|'output');
+            if root == 'models', second must be a valid category from folder_paths.folder_names_and_paths
+    - name: display name
+    - user_metadata: arbitrary JSON object (optional)
+    - hash: optional canonical 'blake3:<hex>' provided by the client for validation / fast-path
+
+    Files created via this endpoint are stored on disk using the **content hash** as the filename stem
+    and the original extension is preserved when available.
+    """
+    model_config = ConfigDict(extra="ignore", str_strip_whitespace=True)
+
+    tags: list[str] = Field(..., min_length=1)
+    name: str | None = Field(default=None, max_length=512, description="Display Name")
+    user_metadata: dict[str, Any] = Field(default_factory=dict)
+    hash: str | None = Field(default=None)
+
+    @field_validator("hash", mode="before")
+    @classmethod
+    def _parse_hash(cls, v):
        if v is None:
            return None
-        s = str(v).strip()
+        s = str(v).strip().lower()
        if not s:
            return None
-        try:
-            uuid.UUID(s)
-        except Exception:
-            raise ValueError("preview_id must be a UUID")
-        return s
+        if ":" not in s:
+            raise ValueError("hash must be 'blake3:<hex>'")
+        algo, digest = s.split(":", 1)
+        if algo != "blake3":
+            raise ValueError("only canonical 'blake3:<hex>' is accepted here")
+        if not digest or any(c for c in digest if c not in "0123456789abcdef"):
+            raise ValueError("hash digest must be lowercase hex")
+        return f"{algo}:{digest}"
+
+    @field_validator("tags", mode="before")
+    @classmethod
+    def _parse_tags(cls, v):
+        """
+        Flatten raw ``tags`` form values into a normalized, deduplicated, ordered list.
+
+        ``v`` may be a single string or a list of strings; each string is read as a
+        JSON array ('["models","loras"]'), a comma-separated list ('models, loras')
+        or a single token. Any other input type yields an empty list.
+        """
+        if isinstance(v, str):
+            v = [v]
+        if not isinstance(v, list):
+            # Covers None as well as unsupported types.
+            return []
+
+        raw_tokens: list[str] = []
+        for entry in v:
+            text = "" if entry is None else str(entry).strip()
+            if not text:
+                continue
+            decoded = None
+            if text.startswith("["):
+                try:
+                    decoded = json.loads(text)
+                except Exception:
+                    decoded = None  # not valid JSON: treat as comma-separated below
+            if isinstance(decoded, list):
+                raw_tokens.extend(str(x) for x in decoded)
+            else:
+                raw_tokens.extend(piece for piece in text.split(",") if piece.strip())
+
+        # Strip and lower-case, drop empties, keep first occurrence of each tag.
+        cleaned = (str(tok).strip().lower() for tok in raw_tokens)
+        return list(dict.fromkeys(tag for tag in cleaned if tag))
+
+    @field_validator("user_metadata", mode="before")
+    @classmethod
+    def _parse_metadata_json(cls, v):
+        """Accept a dict, or a string holding a JSON object; anything else becomes ``{}``.
+
+        Raises ValueError when a non-empty string is not valid JSON or does not
+        decode to an object.
+        """
+        if isinstance(v, dict):
+            return v or {}
+        if not isinstance(v, str):
+            # None and every unsupported type fall back to empty metadata.
+            return {}
+
+        text = v.strip()
+        if not text:
+            return {}
+        try:
+            decoded = json.loads(text)
+        except Exception as e:
+            raise ValueError(f"user_metadata must be JSON: {e}") from e
+        if isinstance(decoded, dict):
+            return decoded
+        raise ValueError("user_metadata must be a JSON object")
+
+    @model_validator(mode="after")
+    def _validate_order(self):
+        """Require a root tag first and, for ``models``, at least one further tag."""
+        if not self.tags:
+            raise ValueError("tags must be provided and non-empty")
+        root, *rest = self.tags
+        if root not in {"models", "input", "output"}:
+            raise ValueError("first tag must be one of: models, input, output")
+        if root == "models" and not rest:
+            raise ValueError("models uploads require a category tag as the second tag")
+        return self
@@ -29,6 +29,21 @@ class AssetsList(BaseModel):
    has_more: bool


+class AssetUpdated(BaseModel):
+    """Serializable view of an asset's id, name, hash, tags, user metadata and update time."""
+    model_config = ConfigDict(from_attributes=True)
+
+    id: str
+    name: str
+    asset_hash: str | None = None
+    tags: list[str] = Field(default_factory=list)
+    user_metadata: dict[str, Any] = Field(default_factory=dict)
+    updated_at: datetime | None = None
+
+    @field_serializer("updated_at")
+    def _ser_updated(self, v: datetime | None, _info):
+        # Emit ISO-8601 text, or None when no timestamp is set.
+        if not v:
+            return None
+        return v.isoformat()
+
+
 class AssetDetail(BaseModel):
    id: str
    name: str
@@ -48,6 +63,10 @@ class AssetDetail(BaseModel):
        return v.isoformat() if v else None


+class AssetCreated(AssetDetail):
+    """AssetDetail plus a required ``created_new`` boolean."""
+    created_new: bool
+
+
 class TagUsage(BaseModel):
    name: str
    count: int
@@ -58,3 +77,17 @@ class TagsList(BaseModel):
    tags: list[TagUsage] = Field(default_factory=list)
    total: int
    has_more: bool
+
+
+class TagsAdd(BaseModel):
+    """Three string lists, ``added``, ``already_present`` and ``total_tags``, each defaulting to empty."""
+    # str_strip_whitespace: pydantic strips surrounding whitespace from string values.
+    model_config = ConfigDict(str_strip_whitespace=True)
+    added: list[str] = Field(default_factory=list)
+    already_present: list[str] = Field(default_factory=list)
+    total_tags: list[str] = Field(default_factory=list)
+
+
+class TagsRemove(BaseModel):
+    """Three string lists, ``removed``, ``not_present`` and ``total_tags``, each defaulting to empty."""
+    # str_strip_whitespace: pydantic strips surrounding whitespace from string values.
+    model_config = ConfigDict(str_strip_whitespace=True)
+    removed: list[str] = Field(default_factory=list)
+    not_present: list[str] = Field(default_factory=list)
+    total_tags: list[str] = Field(default_factory=list)
@@ -1,9 +1,17 @@
+import os
+import logging
 import sqlalchemy as sa
 from collections import defaultdict
-from sqlalchemy import select, exists, func
+from datetime import datetime
+from typing import Iterable, Any
+from sqlalchemy import select, delete, exists, func
+from sqlalchemy.dialects import sqlite
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session, contains_eager, noload
-from app.assets.database.models import Asset, AssetInfo, AssetInfoMeta, AssetInfoTag, Tag
-from app.assets.helpers import escape_like_prefix, normalize_tags
+from app.assets.database.models import Asset, AssetInfo, AssetCacheState, AssetInfoMeta, AssetInfoTag, Tag
+from app.assets.helpers import (
+    compute_relative_filename, escape_like_prefix, normalize_tags, project_kv, utcnow
+)
 from typing import Sequence


@@ -15,6 +23,22 @@ def visible_owner_clause(owner_id: str) -> sa.sql.ClauseElement:
    return AssetInfo.owner_id.in_(["", owner_id])


+def pick_best_live_path(states: Sequence[AssetCacheState]) -> str:
+    """
+    Choose the most trustworthy existing file path among the given cache states.
+
+    Only states whose ``file_path`` is set and points at an existing regular file
+    are considered. Among those, the first one not flagged ``needs_verify`` wins;
+    failing that, the first existing one; with none existing, "" is returned.
+    """
+    live = [st for st in states if getattr(st, "file_path", None) and os.path.isfile(st.file_path)]
+    verified = next((st for st in live if not getattr(st, "needs_verify", False)), None)
+    if verified is not None:
+        return verified.file_path
+    return live[0].file_path if live else ""
+
+
 def apply_tag_filters(
    stmt: sa.sql.Select,
    include_tags: Sequence[str] | None = None,
@@ -42,6 +66,7 @@ def apply_tag_filters(
        )
    return stmt

+
 def apply_metadata_filter(
    stmt: sa.sql.Select,
    metadata_filter: dict | None = None,
@@ -94,7 +119,11 @@ def apply_metadata_filter(
    return stmt


-def asset_exists_by_hash(session: Session, asset_hash: str) -> bool:
+def asset_exists_by_hash(
+    session: Session,
+    *,
+    asset_hash: str,
+) -> bool:
    """
    Check if an asset with a given hash exists in database.
    """
@@ -105,9 +134,39 @@ def asset_exists_by_hash(session: Session, asset_hash: str) -> bool:
    ).first()
    return row is not None

-def get_asset_info_by_id(session: Session, asset_info_id: str) -> AssetInfo | None:
+
+def asset_info_exists_for_asset_id(
+    session: Session,
+    *,
+    asset_id: str,
+) -> bool:
+    """Return True if at least one AssetInfo row references ``asset_id``."""
+    probe = select(sa.literal(True)).select_from(AssetInfo)
+    probe = probe.where(AssetInfo.asset_id == asset_id).limit(1)
+    hit = session.execute(probe).first()
+    return hit is not None
+
+
+def get_asset_by_hash(
+    session: Session,
+    *,
+    asset_hash: str,
+) -> Asset | None:
+    """Return the Asset whose ``hash`` equals ``asset_hash``, or None if there is none."""
+    lookup = select(Asset).where(Asset.hash == asset_hash).limit(1)
+    return session.execute(lookup).scalars().first()
+
+
+def get_asset_info_by_id(
+    session: Session,
+    *,
+    asset_info_id: str,
+) -> AssetInfo | None:
+    """Look up an AssetInfo by primary key via ``session.get``; None when not found."""
    return session.get(AssetInfo, asset_info_id)

+
 def list_asset_infos_page(
    session: Session,
    owner_id: str = "",
@@ -171,12 +230,14 @@ def list_asset_infos_page(
            select(AssetInfoTag.asset_info_id, Tag.name)
            .join(Tag, Tag.name == AssetInfoTag.tag_name)
            .where(AssetInfoTag.asset_info_id.in_(id_list))
+            .order_by(AssetInfoTag.added_at)
        )
        for aid, tag_name in rows.all():
            tag_map[aid].append(tag_name)

    return infos, tag_map, total

+
 def fetch_asset_info_asset_and_tags(
    session: Session,
    asset_info_id: str,
@@ -208,6 +269,494 @@ def fetch_asset_info_asset_and_tags(
            tags.append(tag_name)
    return first_info, first_asset, tags

+
+def fetch_asset_info_and_asset(
+    session: Session,
+    *,
+    asset_info_id: str,
+    owner_id: str = "",
+) -> tuple[AssetInfo, Asset] | None:
+    """
+    Load an AssetInfo together with its Asset, restricted to rows visible to ``owner_id``.
+
+    Tags are not loaded (``noload``). Returns None when no matching row exists.
+    """
+    query = select(AssetInfo, Asset).join(Asset, Asset.id == AssetInfo.asset_id)
+    query = query.where(AssetInfo.id == asset_info_id, visible_owner_clause(owner_id))
+    query = query.limit(1).options(noload(AssetInfo.tags))
+
+    found = session.execute(query).first()
+    if not found:
+        return None
+    info, asset = found[0], found[1]
+    return info, asset
+
+def list_cache_states_by_asset_id(
+    session: Session, *, asset_id: str
+) -> Sequence[AssetCacheState]:
+    """Return every AssetCacheState row for ``asset_id``, ordered by ascending id."""
+    query = select(AssetCacheState).where(AssetCacheState.asset_id == asset_id)
+    ordered = query.order_by(AssetCacheState.id.asc())
+    return session.execute(ordered).scalars().all()
+
+
+def touch_asset_info_by_id(
+    session: Session,
+    *,
+    asset_info_id: str,
+    ts: datetime | None = None,
+    only_if_newer: bool = True,
+) -> None:
+    """
+    Set ``last_access_time`` on one AssetInfo row.
+
+    ``ts`` defaults to ``utcnow()``. With ``only_if_newer`` the row is only
+    updated when its stored time is NULL or earlier than ``ts``.
+    """
+    if not ts:
+        ts = utcnow()
+    conditions = [AssetInfo.id == asset_info_id]
+    if only_if_newer:
+        conditions.append(
+            sa.or_(AssetInfo.last_access_time.is_(None), AssetInfo.last_access_time < ts)
+        )
+    session.execute(sa.update(AssetInfo).where(*conditions).values(last_access_time=ts))
+
+
+def create_asset_info_for_existing_asset(
+    session: Session,
+    *,
+    asset_hash: str,
+    name: str,
+    user_metadata: dict | None = None,
+    tags: Sequence[str] | None = None,
+    tag_origin: str = "manual",
+    owner_id: str = "",
+) -> AssetInfo:
+    """
+    Create or return an existing AssetInfo for an Asset identified by asset_hash.
+
+    A new AssetInfo is inserted inside a savepoint. If the insert raises
+    IntegrityError, the row matching (asset_id, name, owner_id) is returned
+    instead; on that path ``user_metadata`` and ``tags`` are NOT applied.
+
+    For a newly created row, the metadata (plus a computed "filename" entry,
+    when a live file path exists) is projected, and ``tags`` — if not None —
+    replace the row's tag links.
+
+    Raises:
+        ValueError: no Asset has the given hash.
+        RuntimeError: the insert conflicted but no matching row could be found.
+    """
+    now = utcnow()
+    asset = get_asset_by_hash(session, asset_hash=asset_hash)
+    if not asset:
+        raise ValueError(f"Unknown asset hash {asset_hash}")
+
+    info = AssetInfo(
+        owner_id=owner_id,
+        name=name,
+        asset_id=asset.id,
+        preview_id=None,
+        created_at=now,
+        updated_at=now,
+        last_access_time=now,
+    )
+    try:
+        # Savepoint: a failed insert must not abort the caller's outer transaction.
+        with session.begin_nested():
+            session.add(info)
+            session.flush()
+    except IntegrityError:
+        # Presumably a uniqueness conflict on (asset_id, name, owner_id) — verify against the model.
+        existing = (
+            session.execute(
+                select(AssetInfo)
+                .options(noload(AssetInfo.tags))
+                .where(
+                    AssetInfo.asset_id == asset.id,
+                    AssetInfo.name == name,
+                    AssetInfo.owner_id == owner_id,
+                )
+                .limit(1)
+            )
+        ).unique().scalars().first()
+        if not existing:
+            raise RuntimeError("AssetInfo upsert failed to find existing row after conflict.")
+        return existing
+
+    # metadata["filename"] hack
+    new_meta = dict(user_metadata or {})
+    computed_filename = None
+    try:
+        p = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id))
+        if p:
+            computed_filename = compute_relative_filename(p)
+    except Exception:
+        # Best effort: any failure here just means no "filename" entry is added.
+        computed_filename = None
+    if computed_filename:
+        # The computed value overrides any caller-supplied "filename" key.
+        new_meta["filename"] = computed_filename
+    if new_meta:
+        replace_asset_info_metadata_projection(
+            session,
+            asset_info_id=info.id,
+            user_metadata=new_meta,
+        )
+
+    # tags=None means "leave tags alone"; an empty sequence is still passed through.
+    if tags is not None:
+        set_asset_info_tags(
+            session,
+            asset_info_id=info.id,
+            tags=tags,
+            origin=tag_origin,
+        )
+    return info
+
+
+def set_asset_info_tags(
+    session: Session,
+    *,
+    asset_info_id: str,
+    tags: Sequence[str],
+    origin: str = "manual",
+) -> dict:
+    """
+    Make the tag links of an AssetInfo match ``tags`` exactly.
+
+    ``tags`` are normalized and de-duplicated (first occurrence wins). Links
+    for tags not yet attached are created with the given ``origin`` (missing
+    Tag rows are created as type "user"); links for tags no longer wanted are
+    deleted.
+
+    Returns a dict with:
+      - "added": tags newly linked, in the order requested
+      - "removed": tags unlinked, sorted
+      - "total": the full de-duplicated desired tag list
+    """
+    # De-duplicate while keeping first-seen order: a repeated tag would
+    # otherwise yield two identical link rows in the same flush.
+    desired = list(dict.fromkeys(normalize_tags(tags)))
+    desired_set = set(desired)
+
+    current = set(
+        tag_name for (tag_name,) in (
+            session.execute(select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id))
+        ).all()
+    )
+
+    to_add = [t for t in desired if t not in current]
+    # Sorted so the reported removal order does not depend on set iteration order.
+    to_remove = sorted(current - desired_set)
+
+    if to_add:
+        ensure_tags_exist(session, to_add, tag_type="user")
+        session.add_all([
+            AssetInfoTag(asset_info_id=asset_info_id, tag_name=t, origin=origin, added_at=utcnow())
+            for t in to_add
+        ])
+        session.flush()
+
+    if to_remove:
+        session.execute(
+            delete(AssetInfoTag)
+            .where(AssetInfoTag.asset_info_id == asset_info_id, AssetInfoTag.tag_name.in_(to_remove))
+        )
+        session.flush()
+
+    return {"added": to_add, "removed": to_remove, "total": desired}
+
+
+def replace_asset_info_metadata_projection(
+    session: Session,
+    *,
+    asset_info_id: str,
+    user_metadata: dict | None = None,
+) -> None:
+    """
+    Replace an AssetInfo's ``user_metadata`` and rebuild its AssetInfoMeta rows.
+
+    The stored metadata is overwritten (an empty dict when ``user_metadata`` is
+    falsy), ``updated_at`` is bumped, all existing projection rows for the
+    AssetInfo are deleted, and new rows are created from ``project_kv`` for
+    every key/value pair.
+
+    Raises:
+        ValueError: no AssetInfo has the given id.
+    """
+    target = session.get(AssetInfo, asset_info_id)
+    if not target:
+        raise ValueError(f"AssetInfo {asset_info_id} not found")
+
+    target.user_metadata = user_metadata if user_metadata else {}
+    target.updated_at = utcnow()
+    session.flush()
+
+    wipe = delete(AssetInfoMeta).where(AssetInfoMeta.asset_info_id == asset_info_id)
+    session.execute(wipe)
+    session.flush()
+
+    if not user_metadata:
+        return
+
+    projected: list[AssetInfoMeta] = [
+        AssetInfoMeta(
+            asset_info_id=asset_info_id,
+            key=row["key"],
+            ordinal=int(row["ordinal"]),
+            val_str=row.get("val_str"),
+            val_num=row.get("val_num"),
+            val_bool=row.get("val_bool"),
+            val_json=row.get("val_json"),
+        )
+        for meta_key, meta_value in user_metadata.items()
+        for row in project_kv(meta_key, meta_value)
+    ]
+    if projected:
+        session.add_all(projected)
+        session.flush()
+
+
+def ingest_fs_asset(
+    session: Session,
+    *,
+    asset_hash: str,
+    abs_path: str,
+    size_bytes: int,
+    mtime_ns: int,
+    mime_type: str | None = None,
+    info_name: str | None = None,
+    owner_id: str = "",
+    preview_id: str | None = None,
+    user_metadata: dict | None = None,
+    tags: Sequence[str] = (),
+    tag_origin: str = "manual",
+    require_existing_tags: bool = False,
+) -> dict:
+    """
+    Idempotently upsert:
+      - Asset by content hash (create if missing)
+      - AssetCacheState(file_path) pointing to asset_id
+      - Optionally AssetInfo + tag links and metadata projection
+
+    The AssetInfo step only runs when ``info_name`` is truthy. A ``preview_id``
+    that does not name an existing Asset is silently dropped. Tags are
+    lower-cased, stripped and de-duplicated; they are only ever added here,
+    never removed. At the end the "missing" tag is cleared from every
+    AssetInfo of the asset; a failure there is logged, not raised.
+
+    Returns a dict with the boolean flags ``asset_created``, ``asset_updated``,
+    ``state_created``, ``state_updated`` and ``asset_info_id`` (None when no
+    AssetInfo was requested).
+
+    Raises:
+        ValueError: ``require_existing_tags`` is set and some tag is unknown.
+        RuntimeError: the Asset or AssetInfo row cannot be read back after
+            its insert attempt.
+    """
+    locator = os.path.abspath(abs_path)
+    now = utcnow()
+
+    if preview_id:
+        if not session.get(Asset, preview_id):
+            preview_id = None
+
+    out: dict[str, Any] = {
+        "asset_created": False,
+        "asset_updated": False,
+        "state_created": False,
+        "state_updated": False,
+        "asset_info_id": None,
+    }
+
+    # 1) Asset by hash
+    asset = get_asset_by_hash(session, asset_hash=asset_hash)
+    if not asset:
+        vals = {
+            "hash": asset_hash,
+            "size_bytes": int(size_bytes),
+            "mime_type": mime_type,
+            "created_at": now,
+        }
+        # ON CONFLICT DO NOTHING: a concurrent insert of the same hash is not an error.
+        res = session.execute(
+            sqlite.insert(Asset)
+            .values(**vals)
+            .on_conflict_do_nothing(index_elements=[Asset.hash])
+        )
+        if int(res.rowcount or 0) > 0:
+            out["asset_created"] = True
+        asset = get_asset_by_hash(session, asset_hash=asset_hash)
+        if not asset:
+            raise RuntimeError("Asset row not found after upsert.")
+    else:
+        changed = False
+        # A non-positive size never overwrites a stored size.
+        if asset.size_bytes != int(size_bytes) and int(size_bytes) > 0:
+            asset.size_bytes = int(size_bytes)
+            changed = True
+        if mime_type and asset.mime_type != mime_type:
+            asset.mime_type = mime_type
+            changed = True
+        if changed:
+            out["asset_updated"] = True
+
+    # 2) AssetCacheState upsert by file_path (unique)
+    vals = {
+        "asset_id": asset.id,
+        "file_path": locator,
+        "mtime_ns": int(mtime_ns),
+    }
+    ins = (
+        sqlite.insert(AssetCacheState)
+        .values(**vals)
+        .on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
+    )
+
+    res = session.execute(ins)
+    if int(res.rowcount or 0) > 0:
+        out["state_created"] = True
+    else:
+        # Row already exists for this path: update it only if something differs.
+        upd = (
+            sa.update(AssetCacheState)
+            .where(AssetCacheState.file_path == locator)
+            .where(
+                sa.or_(
+                    AssetCacheState.asset_id != asset.id,
+                    AssetCacheState.mtime_ns.is_(None),
+                    AssetCacheState.mtime_ns != int(mtime_ns),
+                )
+            )
+            .values(asset_id=asset.id, mtime_ns=int(mtime_ns))
+        )
+        res2 = session.execute(upd)
+        if int(res2.rowcount or 0) > 0:
+            out["state_updated"] = True
+
+    # 3) Optional AssetInfo + tags + metadata
+    if info_name:
+        try:
+            # Savepoint so a conflicting insert does not abort the outer transaction.
+            with session.begin_nested():
+                info = AssetInfo(
+                    owner_id=owner_id,
+                    name=info_name,
+                    asset_id=asset.id,
+                    preview_id=preview_id,
+                    created_at=now,
+                    updated_at=now,
+                    last_access_time=now,
+                )
+                session.add(info)
+                session.flush()
+                out["asset_info_id"] = info.id
+        except IntegrityError:
+            # Insert conflicted; the row is looked up just below.
+            pass
+
+        existing_info = (
+            session.execute(
+                select(AssetInfo)
+                .where(
+                    AssetInfo.asset_id == asset.id,
+                    AssetInfo.name == info_name,
+                    (AssetInfo.owner_id == owner_id),
+                )
+                .limit(1)
+            )
+        ).unique().scalar_one_or_none()
+        if not existing_info:
+            raise RuntimeError("Failed to update or insert AssetInfo.")
+
+        if preview_id and existing_info.preview_id != preview_id:
+            existing_info.preview_id = preview_id
+
+        existing_info.updated_at = now
+        # Guard against a NULL stored time: comparing None with a datetime raises TypeError.
+        if existing_info.last_access_time is None or existing_info.last_access_time < now:
+            existing_info.last_access_time = now
+        session.flush()
+        out["asset_info_id"] = existing_info.id
+
+        # De-duplicate (keeping order) so a repeated tag cannot produce two link rows.
+        norm = list(dict.fromkeys(t.strip().lower() for t in (tags or []) if (t or "").strip()))
+        if norm and out["asset_info_id"] is not None:
+            if not require_existing_tags:
+                ensure_tags_exist(session, norm, tag_type="user")
+
+            existing_tag_names = set(
+                name for (name,) in (session.execute(select(Tag.name).where(Tag.name.in_(norm)))).all()
+            )
+            missing = [t for t in norm if t not in existing_tag_names]
+            if missing and require_existing_tags:
+                raise ValueError(f"Unknown tags: {missing}")
+
+            existing_links = set(
+                tag_name
+                for (tag_name,) in (
+                    session.execute(
+                        select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == out["asset_info_id"])
+                    )
+                ).all()
+            )
+            to_add = [t for t in norm if t in existing_tag_names and t not in existing_links]
+            if to_add:
+                session.add_all(
+                    [
+                        AssetInfoTag(
+                            asset_info_id=out["asset_info_id"],
+                            tag_name=t,
+                            origin=tag_origin,
+                            added_at=now,
+                        )
+                        for t in to_add
+                    ]
+                )
+                session.flush()
+
+        # metadata["filename"] hack
+        if out["asset_info_id"] is not None:
+            primary_path = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id))
+            computed_filename = compute_relative_filename(primary_path) if primary_path else None
+
+            # Merge: stored metadata, then caller-supplied keys, then the computed filename.
+            current_meta = existing_info.user_metadata or {}
+            new_meta = dict(current_meta)
+            if user_metadata is not None:
+                new_meta.update(user_metadata)
+            if computed_filename:
+                new_meta["filename"] = computed_filename
+
+            # Skip the rewrite of the projection rows when nothing changed.
+            if new_meta != current_meta:
+                replace_asset_info_metadata_projection(
+                    session,
+                    asset_info_id=out["asset_info_id"],
+                    user_metadata=new_meta,
+                )
+
+    try:
+        remove_missing_tag_for_asset_id(session, asset_id=asset.id)
+    except Exception:
+        # Deliberately best effort: the ingest itself already succeeded.
+        logging.exception("Failed to clear 'missing' tag for asset %s", asset.id)
+    return out
+
+
+def update_asset_info_full(
+    session: Session,
+    *,
+    asset_info_id: str,
+    name: str | None = None,
+    tags: Sequence[str] | None = None,
+    user_metadata: dict | None = None,
+    tag_origin: str = "manual",
+    asset_info_row: Any = None,
+) -> AssetInfo:
+    """
+    Update name, metadata and/or tags of an AssetInfo and return the row.
+
+    Each of ``name``, ``tags`` and ``user_metadata`` is left untouched when it
+    is None. ``user_metadata`` replaces the stored metadata wholesale; in
+    either case the "filename" entry is set to the path computed from the
+    asset's live cache state, when one exists.
+
+    ``asset_info_row`` lets the caller pass an already loaded row to avoid a
+    lookup; metadata and tag writes still address ``asset_info_id``, so the
+    two are presumably expected to refer to the same row — confirm with callers.
+
+    Raises:
+        ValueError: no row was passed and no AssetInfo has the given id.
+    """
+    if not asset_info_row:
+        info = session.get(AssetInfo, asset_info_id)
+        if not info:
+            raise ValueError(f"AssetInfo {asset_info_id} not found")
+    else:
+        info = asset_info_row
+
+    touched = False
+    if name is not None and name != info.name:
+        info.name = name
+        touched = True
+
+    computed_filename = None
+    try:
+        p = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=info.asset_id))
+        if p:
+            computed_filename = compute_relative_filename(p)
+    except Exception:
+        # Best effort: on any failure the "filename" entry is simply not refreshed.
+        computed_filename = None
+
+    if user_metadata is not None:
+        new_meta = dict(user_metadata)
+        if computed_filename:
+            # The computed value overrides a caller-supplied "filename".
+            new_meta["filename"] = computed_filename
+        replace_asset_info_metadata_projection(
+            session, asset_info_id=asset_info_id, user_metadata=new_meta
+        )
+        touched = True
+    else:
+        if computed_filename:
+            current_meta = info.user_metadata or {}
+            # Only rewrite the projection when the stored filename is stale.
+            if current_meta.get("filename") != computed_filename:
+                new_meta = dict(current_meta)
+                new_meta["filename"] = computed_filename
+                replace_asset_info_metadata_projection(
+                    session, asset_info_id=asset_info_id, user_metadata=new_meta
+                )
+                touched = True
+
+    if tags is not None:
+        set_asset_info_tags(
+            session,
+            asset_info_id=asset_info_id,
+            tags=tags,
+            origin=tag_origin,
+        )
+        touched = True
+
+    # When user_metadata was supplied, replace_asset_info_metadata_projection
+    # has already bumped updated_at and flushed, so it is skipped here.
+    if touched and user_metadata is None:
+        info.updated_at = utcnow()
+        session.flush()
+
+    return info
+
+
+def delete_asset_info_by_id(
+    session: Session,
+    *,
+    asset_info_id: str,
+    owner_id: str,
+) -> bool:
+    """Delete the AssetInfo with this id if it is visible to ``owner_id``; True when a row was deleted."""
+    removal = sa.delete(AssetInfo).where(
+        AssetInfo.id == asset_info_id,
+        visible_owner_clause(owner_id),
+    )
+    outcome = session.execute(removal)
+    deleted = int(outcome.rowcount or 0)
+    return deleted > 0
+
+
 def list_tags_with_usage(
    session: Session,
    prefix: str | None = None,
@@ -265,3 +814,163 @@ def list_tags_with_usage(

    rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows]
    return rows_norm, int(total or 0)
+
+
+def ensure_tags_exist(session: Session, names: Iterable[str], tag_type: str = "user") -> None:
+    """Insert a Tag row of type ``tag_type`` for each normalized name; names that already exist are left as-is."""
+    normalized = normalize_tags(list(names))
+    if not normalized:
+        return
+    # dict.fromkeys drops repeats while keeping first-seen order.
+    payload = [{"name": tag_name, "tag_type": tag_type} for tag_name in dict.fromkeys(normalized)]
+    statement = sqlite.insert(Tag).values(payload).on_conflict_do_nothing(index_elements=[Tag.name])
+    session.execute(statement)
+
+
+def get_asset_tags(session: Session, *, asset_info_id: str) -> list[str]:
+    """Return the names of all tags linked to the given AssetInfo."""
+    query = select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
+    fetched = session.execute(query).all()
+    return [row[0] for row in fetched]
+
+
+def add_tags_to_asset_info(
+    session: Session,
+    *,
+    asset_info_id: str,
+    tags: Sequence[str],
+    origin: str = "manual",
+    create_if_missing: bool = True,
+    asset_info_row: Any = None,
+) -> dict:
+    """
+    Link additional tags to an AssetInfo without touching its existing links.
+
+    ``tags`` are normalized first. With ``create_if_missing`` missing Tag rows
+    are created as type "user". The new links are inserted inside a savepoint;
+    if that insert raises IntegrityError the whole batch is rolled back and
+    nothing is added (reflected in the returned "added" list).
+
+    ``asset_info_row`` lets the caller skip the existence check by passing an
+    already loaded row.
+
+    Returns a dict with sorted lists:
+      - "added": requested tags that are linked now but were not before
+      - "already_present": requested tags that were already linked
+      - "total_tags": every tag linked after the call (unsorted, as stored,
+        when no usable tag was requested)
+
+    Raises:
+        ValueError: no row was passed and no AssetInfo has the given id.
+    """
+    # Only existence matters here; the loaded row itself is not needed.
+    if not asset_info_row and not session.get(AssetInfo, asset_info_id):
+        raise ValueError(f"AssetInfo {asset_info_id} not found")
+
+    norm = normalize_tags(tags)
+    if not norm:
+        total = get_asset_tags(session, asset_info_id=asset_info_id)
+        return {"added": [], "already_present": [], "total_tags": total}
+
+    if create_if_missing:
+        ensure_tags_exist(session, norm, tag_type="user")
+
+    current = {
+        tag_name
+        for (tag_name,) in (
+            session.execute(
+                sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
+            )
+        ).all()
+    }
+
+    want = set(norm)
+    to_add = sorted(want - current)
+
+    if to_add:
+        try:
+            # Same savepoint pattern as the other inserts in this module: leaving
+            # the block through an exception rolls the savepoint back.
+            with session.begin_nested():
+                session.add_all(
+                    [
+                        AssetInfoTag(
+                            asset_info_id=asset_info_id,
+                            tag_name=t,
+                            origin=origin,
+                            added_at=utcnow(),
+                        )
+                        for t in to_add
+                    ]
+                )
+                session.flush()
+        except IntegrityError:
+            # Deliberately tolerated: the re-read below reports what was really added.
+            pass
+
+    after = set(get_asset_tags(session, asset_info_id=asset_info_id))
+    return {
+        "added": sorted(((after - current) & want)),
+        "already_present": sorted(want & current),
+        "total_tags": sorted(after),
+    }
+
+
+def remove_tags_from_asset_info(
+    session: Session,
+    *,
+    asset_info_id: str,
+    tags: Sequence[str],
+) -> dict:
+    """
+    Unlink the given tags from an AssetInfo.
+
+    Returns a dict with "removed" (requested tags that were linked, sorted),
+    "not_present" (requested tags that were not linked, sorted) and
+    "total_tags" (the tags still linked afterwards).
+
+    Raises:
+        ValueError: no AssetInfo has the given id.
+    """
+    if not session.get(AssetInfo, asset_info_id):
+        raise ValueError(f"AssetInfo {asset_info_id} not found")
+
+    requested = normalize_tags(tags)
+    if not requested:
+        return {
+            "removed": [],
+            "not_present": [],
+            "total_tags": get_asset_tags(session, asset_info_id=asset_info_id),
+        }
+
+    link_query = sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
+    linked = {row[0] for row in session.execute(link_query).all()}
+
+    requested_set = set(requested)
+    to_remove = sorted(requested_set & linked)
+    not_present = sorted(requested_set - linked)
+
+    if to_remove:
+        unlink = delete(AssetInfoTag).where(
+            AssetInfoTag.asset_info_id == asset_info_id,
+            AssetInfoTag.tag_name.in_(to_remove),
+        )
+        session.execute(unlink)
+        session.flush()
+
+    remaining = get_asset_tags(session, asset_info_id=asset_info_id)
+    return {"removed": to_remove, "not_present": not_present, "total_tags": remaining}
+
+
+def remove_missing_tag_for_asset_id(
+    session: Session,
+    *,
+    asset_id: str,
+) -> None:
+    """Delete the "missing" tag link from every AssetInfo that references ``asset_id``."""
+    info_ids = sa.select(AssetInfo.id).where(AssetInfo.asset_id == asset_id)
+    purge = sa.delete(AssetInfoTag).where(
+        AssetInfoTag.asset_info_id.in_(info_ids),
+        AssetInfoTag.tag_name == "missing",
+    )
+    session.execute(purge)
+
+
+def set_asset_info_preview(
+    session: Session,
+    *,
+    asset_info_id: str,
+    preview_asset_id: str | None = None,
+) -> None:
+    """
+    Point an AssetInfo's ``preview_id`` at another Asset, or clear it with None.
+
+    ``updated_at`` is bumped either way.
+
+    Raises:
+        ValueError: the AssetInfo, or a non-None preview Asset, does not exist.
+    """
+    info = session.get(AssetInfo, asset_info_id)
+    if not info:
+        raise ValueError(f"AssetInfo {asset_info_id} not found")
+
+    # Only a real id needs validating; None simply clears the preview.
+    if preview_asset_id is not None and not session.get(Asset, preview_asset_id):
+        raise ValueError(f"Preview Asset {preview_asset_id} not found")
+
+    info.preview_id = preview_asset_id
+    info.updated_at = utcnow()
+    session.flush()
@@ -1,5 +1,6 @@
 import contextlib
 import os
+from decimal import Decimal
 from aiohttp import web
 from datetime import datetime, timezone
 from pathlib import Path
@@ -87,6 +88,40 @@ def get_comfy_models_folders() -> list[tuple[str, list[str]]]:
            targets.append((name, paths))
    return targets

+def resolve_destination_from_tags(tags: list[str]) -> tuple[str, list[str]]:
+    """
+    Validate ``tags`` and map them to ``(base_dir, subdirs_for_fs)``.
+
+    The first tag selects the root:
+      - "models": the second tag is a model category looked up in
+        ``folder_paths.folder_names_and_paths``; its first configured path is
+        the base directory and the remaining tags are sub-directories.
+      - "input": the input directory is the base.
+      - anything else: the output directory is the base.
+    For the last two, every tag after the first is a sub-directory.
+
+    Raises:
+        ValueError: ``tags`` is empty, a model category is missing, unknown or
+            has no configured path, or a sub-directory is "." or "..".
+    """
+    if not tags:
+        # Without this guard tags[0] would raise IndexError, unlike every
+        # other validation failure here, which is a ValueError.
+        raise ValueError("at least one tag required to resolve a destination")
+
+    root = tags[0]
+    if root == "models":
+        if len(tags) < 2:
+            raise ValueError("at least two tags required for model asset")
+        category = tags[1]
+        try:
+            bases = folder_paths.folder_names_and_paths[category][0]
+        except KeyError:
+            raise ValueError(f"unknown model category '{category}'") from None
+        if not bases:
+            raise ValueError(f"no base path configured for category '{category}'")
+        base_dir = os.path.abspath(bases[0])
+        raw_subdirs = tags[2:]
+    else:
+        base_dir = os.path.abspath(
+            folder_paths.get_input_directory() if root == "input" else folder_paths.get_output_directory()
+        )
+        raw_subdirs = tags[1:]
+
+    if any(part in (".", "..") for part in raw_subdirs):
+        raise ValueError("invalid path component in tags")
+
+    return base_dir, raw_subdirs
+
+def ensure_within_base(candidate: str, base: str) -> None:
+    """
+    Ensure ``candidate`` resolves to ``base`` itself or a path inside it.
+
+    Both paths are made absolute first. Returns None on success.
+
+    Raises:
+        ValueError: "destination escapes base directory" when the candidate
+            lies outside ``base``; "invalid destination path" when the two
+            paths cannot be compared at all.
+    """
+    cand_abs = os.path.abspath(candidate)
+    base_abs = os.path.abspath(base)
+    try:
+        common = os.path.commonpath([cand_abs, base_abs])
+    except ValueError:
+        # commonpath rejects paths it cannot compare (e.g. different drives).
+        raise ValueError("invalid destination path") from None
+    # Checked outside the try so this specific error is not caught and
+    # replaced by the generic one above.
+    if common != base_abs:
+        raise ValueError("destination escapes base directory")
+
 def compute_relative_filename(file_path: str) -> str | None:
    """
    Return the model's path relative to the last well-known folder (the model category),
@@ -113,7 +148,6 @@ def compute_relative_filename(file_path: str) -> str | None:
        return "/".join(inside)
    return "/".join(parts)  # input/output: keep all parts

-
 def get_relative_to_root_category_path_of_asset(file_path: str) -> tuple[Literal["input", "output", "models"], str]:
    """Given an absolute or relative file path, determine which root category the path belongs to:
      - 'input' if the file resides under `folder_paths.get_input_directory()`
@@ -215,3 +249,64 @@ def collect_models_files() -> list[str]:
            if allowed:
                out.append(abs_path)
    return out
+
+def is_scalar(v):
+    """Return True when ``v`` is None or a bool, int, float, Decimal or str; False otherwise."""
+    return v is None or isinstance(v, (bool, int, float, Decimal, str))
+
+def project_kv(key: str, value):
+    """
+    Turn a metadata key/value into typed projection rows.
+
+    Returns list[dict]. Every row has ``key`` and ``ordinal``:
+      - None            -> one row with val_str/val_num/val_bool/val_json all None
+      - bool            -> ``val_bool``
+      - int/float/Decimal -> ``val_num`` as a Decimal (non-Decimals go through ``str``)
+      - str             -> ``val_str``
+      - list of scalars -> one row per element as above, ``ordinal`` = index
+      - any other list  -> one ``val_json`` row per element, ``ordinal`` = index
+      - anything else   -> a single ``val_json`` row holding the value
+    Apart from the None case, a row carries only the one ``val_*`` key that applies.
+    An empty list yields no rows.
+    """
+
+    def _scalar_row(ordinal: int, item) -> dict:
+        """Build the row for one scalar ``item`` (as accepted by ``is_scalar``)."""
+        if item is None:
+            return {
+                "key": key, "ordinal": ordinal,
+                "val_str": None, "val_num": None, "val_bool": None, "val_json": None
+            }
+        # bool must be tested before the numeric types: bool is a subclass of int.
+        if isinstance(item, bool):
+            return {"key": key, "ordinal": ordinal, "val_bool": bool(item)}
+        if isinstance(item, (int, float, Decimal)):
+            # Going through str() avoids carrying binary float noise into the Decimal.
+            num = item if isinstance(item, Decimal) else Decimal(str(item))
+            return {"key": key, "ordinal": ordinal, "val_num": num}
+        # The only scalar kind left is str.
+        return {"key": key, "ordinal": ordinal, "val_str": item}
+
+    if is_scalar(value):
+        return [_scalar_row(0, value)]
+
+    if isinstance(value, list):
+        if all(is_scalar(x) for x in value):
+            return [_scalar_row(i, x) for i, x in enumerate(value)]
+        # Mixed or nested lists: keep each element as JSON.
+        return [{"key": key, "ordinal": i, "val_json": x} for i, x in enumerate(value)]
+
+    return [{"key": key, "ordinal": 0, "val_json": value}]
@@ -1,13 +1,33 @@
+import os
+import mimetypes
+import contextlib
 from typing import Sequence

 from app.database.db import create_session
-from app.assets.api import schemas_out
+from app.assets.api import schemas_out, schemas_in
 from app.assets.database.queries import (
    asset_exists_by_hash,
+    asset_info_exists_for_asset_id,
+    get_asset_by_hash,
+    get_asset_info_by_id,
    fetch_asset_info_asset_and_tags,
+    fetch_asset_info_and_asset,
+    create_asset_info_for_existing_asset,
+    touch_asset_info_by_id,
+    update_asset_info_full,
+    delete_asset_info_by_id,
+    list_cache_states_by_asset_id,
    list_asset_infos_page,
    list_tags_with_usage,
+    get_asset_tags,
+    add_tags_to_asset_info,
+    remove_tags_from_asset_info,
+    pick_best_live_path,
+    ingest_fs_asset,
+    set_asset_info_preview,
 )
+from app.assets.helpers import resolve_destination_from_tags, ensure_within_base
+from app.assets.database.models import Asset


 def _safe_sort_field(requested: str | None) -> str:
@@ -19,11 +39,28 @@ def _safe_sort_field(requested: str | None) -> str:
    return "created_at"


-def asset_exists(asset_hash: str) -> bool:
+def _get_size_mtime_ns(path: str) -> tuple[int, int]:
+    """Return ``(size_in_bytes, mtime_in_nanoseconds)`` for ``path``, following symlinks."""
+    info = os.stat(path, follow_symlinks=True)
+    if hasattr(info, "st_mtime_ns"):
+        mtime_ns = info.st_mtime_ns
+    else:
+        # Fallback for stat results that only expose a float mtime in seconds.
+        mtime_ns = int(info.st_mtime * 1_000_000_000)
+    return info.st_size, mtime_ns
+
+
+def _safe_filename(name: str | None, fallback: str) -> str:
+    """
+    Reduce ``name`` to its final path component.
+
+    ``fallback`` is used when ``name`` is None or blank; it is also returned
+    unchanged when the basename comes out empty.
+    """
+    candidate = (name or "").strip()
+    if not candidate:
+        candidate = fallback
+    base = os.path.basename(candidate)
+    return base if base else fallback
+
+
+def asset_exists(*, asset_hash: str) -> bool:
+    """
+    Check if an asset with a given hash exists in database.
+
+    Opens its own session via ``create_session()`` and delegates to
+    ``asset_exists_by_hash``.
+    """
    with create_session() as session:
        return asset_exists_by_hash(session, asset_hash=asset_hash)

+
 def list_assets(
+    *,
    include_tags: Sequence[str] | None = None,
    exclude_tags: Sequence[str] | None = None,
    name_contains: str | None = None,
@@ -63,7 +100,6 @@ def list_assets(
                size=int(asset.size_bytes) if asset else None,
                mime_type=asset.mime_type if asset else None,
                tags=tags,
-                preview_url=f"/api/assets/{info.id}/content",
                created_at=info.created_at,
                updated_at=info.updated_at,
                last_access_time=info.last_access_time,
@@ -76,7 +112,12 @@ def list_assets(
        has_more=(offset + len(summaries)) < total,
    )

-def get_asset(asset_info_id: str, owner_id: str = "") -> schemas_out.AssetDetail:
+
+def get_asset(
+    *,
+    asset_info_id: str,
+    owner_id: str = "",
+) -> schemas_out.AssetDetail:
    with create_session() as session:
        res = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
        if not res:
@@ -97,6 +138,358 @@ def get_asset(asset_info_id: str, owner_id: str = "") -> schemas_out.AssetDetail
        last_access_time=info.last_access_time,
    )

+
+def resolve_asset_content_for_download(
+    *,
+    asset_info_id: str,
+    owner_id: str = "",
+) -> tuple[str, str, str]:
+    """
+    Resolve the file backing an asset reference so it can be served.
+
+    Returns an ``(absolute_path, content_type, download_name)`` tuple. Before
+    returning, ``touch_asset_info_by_id`` is called for the reference and the
+    session is committed.
+
+    Raises ValueError when the AssetInfo lookup yields nothing, and
+    FileNotFoundError when no live path can be picked from the cache states.
+    """
+    with create_session() as session:
+        found = fetch_asset_info_and_asset(session, asset_info_id=asset_info_id, owner_id=owner_id)
+        if not found:
+            raise ValueError(f"AssetInfo {asset_info_id} not found")
+        info, asset = found
+
+        live_path = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id))
+        if not live_path:
+            raise FileNotFoundError
+
+        touch_asset_info_by_id(session, asset_info_id=asset_info_id)
+        session.commit()
+
+        # Prefer the stored MIME type; otherwise guess from the display name or the path.
+        content_type = asset.mime_type
+        if not content_type:
+            content_type = mimetypes.guess_type(info.name or live_path)[0] or "application/octet-stream"
+        return live_path, content_type, info.name or os.path.basename(live_path)
+
+
+def upload_asset_from_temp_path(
+    spec: schemas_in.UploadAssetSpec,
+    *,
+    temp_path: str,
+    client_filename: str | None = None,
+    owner_id: str = "",
+    expected_asset_hash: str | None = None,
+) -> schemas_out.AssetCreated:
+    """
+    Create new asset or update existing asset from a temporary file path.
+
+    The file at ``temp_path`` is hashed with blake3. If an asset with that hash
+    already exists, the temporary file is removed (best effort) and only a new
+    AssetInfo reference is created. Otherwise the file is moved to a
+    destination derived from ``spec.tags`` and named ``<digest><ext>``, then
+    ingested into the database.
+
+    Raises:
+        RuntimeError: hashing, moving, stat-ing or ingesting the file failed.
+            The underlying exception is chained as ``__cause__``.
+        ValueError: ``expected_asset_hash`` was given and does not match
+            (message ``"HASH_MISMATCH"``).
+    """
+    try:
+        # NOTE: blake3 is not required right now, so this will fail if blake3 is not installed in local environment
+        import app.assets.hashing as hashing
+        digest = hashing.blake3_hash(temp_path)
+    except Exception as e:
+        raise RuntimeError(f"failed to hash uploaded file: {e}") from e
+    asset_hash = "blake3:" + digest
+
+    if expected_asset_hash and asset_hash != expected_asset_hash.strip().lower():
+        raise ValueError("HASH_MISMATCH")
+
+    # Both branches below use the same display name; compute it once.
+    display_name = _safe_filename(spec.name or (client_filename or ""), fallback=digest)
+
+    with create_session() as session:
+        existing = get_asset_by_hash(session, asset_hash=asset_hash)
+        if existing is not None:
+            # Content is already known: the uploaded copy is redundant, drop it (best effort).
+            with contextlib.suppress(Exception):
+                if temp_path and os.path.exists(temp_path):
+                    os.remove(temp_path)
+
+            info = create_asset_info_for_existing_asset(
+                session,
+                asset_hash=asset_hash,
+                name=display_name,
+                user_metadata=spec.user_metadata or {},
+                tags=spec.tags or [],
+                tag_origin="manual",
+                owner_id=owner_id,
+            )
+            tag_names = get_asset_tags(session, asset_info_id=info.id)
+            session.commit()
+
+            return schemas_out.AssetCreated(
+                id=info.id,
+                name=info.name,
+                asset_hash=existing.hash,
+                size=int(existing.size_bytes) if existing.size_bytes is not None else None,
+                mime_type=existing.mime_type,
+                tags=tag_names,
+                user_metadata=info.user_metadata or {},
+                preview_id=info.preview_id,
+                created_at=info.created_at,
+                last_access_time=info.last_access_time,
+                created_new=False,
+            )
+
+    base_dir, subdirs = resolve_destination_from_tags(spec.tags)
+    dest_dir = os.path.join(base_dir, *subdirs) if subdirs else base_dir
+    os.makedirs(dest_dir, exist_ok=True)
+
+    # Keep the client's extension only when it is non-empty and at most 16 characters.
+    src_for_ext = (client_filename or spec.name or "").strip()
+    _ext = os.path.splitext(os.path.basename(src_for_ext))[1] if src_for_ext else ""
+    ext = _ext if 0 < len(_ext) <= 16 else ""
+    hashed_basename = f"{digest}{ext}"
+    dest_abs = os.path.abspath(os.path.join(dest_dir, hashed_basename))
+    ensure_within_base(dest_abs, base_dir)
+
+    content_type = (
+        mimetypes.guess_type(os.path.basename(src_for_ext), strict=False)[0]
+        or mimetypes.guess_type(hashed_basename, strict=False)[0]
+        or "application/octet-stream"
+    )
+
+    try:
+        os.replace(temp_path, dest_abs)
+    except Exception as e:
+        raise RuntimeError(f"failed to move uploaded file into place: {e}") from e
+
+    try:
+        size_bytes, mtime_ns = _get_size_mtime_ns(dest_abs)
+    except OSError as e:
+        raise RuntimeError(f"failed to stat destination file: {e}") from e
+
+    with create_session() as session:
+        result = ingest_fs_asset(
+            session,
+            asset_hash=asset_hash,
+            abs_path=dest_abs,
+            size_bytes=size_bytes,
+            mtime_ns=mtime_ns,
+            mime_type=content_type,
+            info_name=display_name,
+            owner_id=owner_id,
+            preview_id=None,
+            user_metadata=spec.user_metadata or {},
+            tags=spec.tags,
+            tag_origin="manual",
+            require_existing_tags=False,
+        )
+        info_id = result["asset_info_id"]
+        if not info_id:
+            raise RuntimeError("failed to create asset metadata")
+
+        pair = fetch_asset_info_and_asset(session, asset_info_id=info_id, owner_id=owner_id)
+        if not pair:
+            raise RuntimeError("inconsistent DB state after ingest")
+        info, asset = pair
+        tag_names = get_asset_tags(session, asset_info_id=info.id)
+        # Build the response before committing, while the rows are loaded in this session.
+        created_result = schemas_out.AssetCreated(
+            id=info.id,
+            name=info.name,
+            asset_hash=asset.hash,
+            size=int(asset.size_bytes),
+            mime_type=asset.mime_type,
+            tags=tag_names,
+            user_metadata=info.user_metadata or {},
+            preview_id=info.preview_id,
+            created_at=info.created_at,
+            last_access_time=info.last_access_time,
+            created_new=result["asset_created"],
+        )
+        session.commit()
+
+    return created_result
+
+
+def update_asset(
+    *,
+    asset_info_id: str,
+    name: str | None = None,
+    tags: list[str] | None = None,
+    user_metadata: dict | None = None,
+    owner_id: str = "",
+) -> schemas_out.AssetUpdated:
+    """
+    Apply name / tags / user metadata changes to an asset reference.
+
+    Raises ValueError when the AssetInfo row is missing and PermissionError
+    when the row has an owner that differs from ``owner_id``.
+    """
+    with create_session() as session:
+        current = get_asset_info_by_id(session, asset_info_id=asset_info_id)
+        if not current:
+            raise ValueError(f"AssetInfo {asset_info_id} not found")
+        if current.owner_id and current.owner_id != owner_id:
+            raise PermissionError("not owner")
+
+        updated = update_asset_info_full(
+            session,
+            asset_info_id=asset_info_id,
+            name=name,
+            tags=tags,
+            user_metadata=user_metadata,
+            tag_origin="manual",
+            asset_info_row=current,
+        )
+        current_tags = get_asset_tags(session, asset_info_id=asset_info_id)
+
+        # Assemble the response before the commit, then hand it back once the session closes.
+        response = schemas_out.AssetUpdated(
+            id=updated.id,
+            name=updated.name,
+            asset_hash=updated.asset.hash if updated.asset else None,
+            tags=current_tags,
+            user_metadata=updated.user_metadata or {},
+            updated_at=updated.updated_at,
+        )
+        session.commit()
+
+    return response
+
+
+def set_asset_preview(
+    *,
+    asset_info_id: str,
+    preview_asset_id: str | None = None,
+    owner_id: str = "",
+) -> schemas_out.AssetDetail:
+    """
+    Set the preview of an asset reference and return the refreshed detail view.
+
+    Raises ValueError when the AssetInfo row is missing, PermissionError when
+    the row has an owner that differs from ``owner_id``, and RuntimeError when
+    the row can no longer be fetched after the preview was set.
+    """
+    with create_session() as session:
+        current = get_asset_info_by_id(session, asset_info_id=asset_info_id)
+        if not current:
+            raise ValueError(f"AssetInfo {asset_info_id} not found")
+        if current.owner_id and current.owner_id != owner_id:
+            raise PermissionError("not owner")
+
+        set_asset_info_preview(
+            session,
+            asset_info_id=asset_info_id,
+            preview_asset_id=preview_asset_id,
+        )
+
+        fetched = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
+        if not fetched:
+            raise RuntimeError("State changed during preview update")
+        info, asset, tags = fetched
+
+        # Asset-derived fields stay None when the reference has no asset attached.
+        asset_hash = None
+        size = None
+        mime_type = None
+        if asset:
+            asset_hash = asset.hash
+            if asset.size_bytes is not None:
+                size = int(asset.size_bytes)
+            mime_type = asset.mime_type
+
+        detail = schemas_out.AssetDetail(
+            id=info.id,
+            name=info.name,
+            asset_hash=asset_hash,
+            size=size,
+            mime_type=mime_type,
+            tags=tags,
+            user_metadata=info.user_metadata or {},
+            preview_id=info.preview_id,
+            created_at=info.created_at,
+            last_access_time=info.last_access_time,
+        )
+        session.commit()
+
+    return detail
+
+
+def delete_asset_reference(*, asset_info_id: str, owner_id: str, delete_content_if_orphan: bool = True) -> bool:
+    """
+    Delete one asset reference and, optionally, the content it was the last reference to.
+
+    Returns False when ``delete_asset_info_by_id`` reports that nothing was
+    deleted, True otherwise. When ``delete_content_if_orphan`` is set and no
+    other AssetInfo exists for the asset, the Asset row is deleted and the
+    files named by its cache states are removed on a best-effort basis.
+    """
+    with create_session() as session:
+        row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
+        # Capture the asset id up front; it is needed after the reference row is deleted.
+        asset_id = row.asset_id if row else None
+
+        if not delete_asset_info_by_id(session, asset_info_id=asset_info_id, owner_id=owner_id):
+            session.commit()
+            return False
+
+        wants_cleanup = delete_content_if_orphan and asset_id
+        if not wants_cleanup or asset_info_exists_for_asset_id(session, asset_id=asset_id):
+            session.commit()
+            return True
+
+        cache_states = list_cache_states_by_asset_id(session, asset_id=asset_id) or []
+        doomed_paths = [state.file_path for state in cache_states if getattr(state, "file_path", None)]
+
+        orphan = session.get(Asset, asset_id)
+        if orphan is not None:
+            session.delete(orphan)
+        session.commit()
+
+        # Files are removed only after the commit; any removal failure is ignored.
+        for path in doomed_paths:
+            with contextlib.suppress(Exception):
+                if path and os.path.isfile(path):
+                    os.remove(path)
+    return True
+
+
+def create_asset_from_hash(
+    *,
+    hash_str: str,
+    name: str,
+    tags: list[str] | None = None,
+    user_metadata: dict | None = None,
+    owner_id: str = "",
+) -> schemas_out.AssetCreated | None:
+    """
+    Create a new asset reference for content that is already known by hash.
+
+    ``hash_str`` is stripped and lower-cased before the lookup. Returns None
+    when no asset with that hash exists; otherwise returns the created
+    reference with ``created_new=False``.
+    """
+    canonical = hash_str.strip().lower()
+    with create_session() as session:
+        asset = get_asset_by_hash(session, asset_hash=canonical)
+        if not asset:
+            return None
+
+        # Fallback name is the part after the first ":"; index -1 keeps the
+        # whole string (instead of raising IndexError) when there is no ":".
+        fallback_name = canonical.split(":", 1)[-1]
+        info = create_asset_info_for_existing_asset(
+            session,
+            asset_hash=canonical,
+            name=_safe_filename(name, fallback=fallback_name),
+            user_metadata=user_metadata or {},
+            tags=tags or [],
+            tag_origin="manual",
+            owner_id=owner_id,
+        )
+        tag_names = get_asset_tags(session, asset_info_id=info.id)
+        result = schemas_out.AssetCreated(
+            id=info.id,
+            name=info.name,
+            asset_hash=asset.hash,
+            size=int(asset.size_bytes),
+            mime_type=asset.mime_type,
+            tags=tag_names,
+            user_metadata=info.user_metadata or {},
+            preview_id=info.preview_id,
+            created_at=info.created_at,
+            last_access_time=info.last_access_time,
+            created_new=False,
+        )
+        session.commit()
+
+    return result
+
+
+def add_tags_to_asset(
+    *,
+    asset_info_id: str,
+    tags: list[str],
+    origin: str = "manual",
+    owner_id: str = "",
+) -> schemas_out.TagsAdd:
+    """
+    Attach tags to an asset reference, creating missing tags on the way.
+
+    Raises ValueError when the AssetInfo row is missing and PermissionError
+    when the row has an owner that differs from ``owner_id``.
+    """
+    with create_session() as session:
+        row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
+        if not row:
+            raise ValueError(f"AssetInfo {asset_info_id} not found")
+        if row.owner_id and row.owner_id != owner_id:
+            raise PermissionError("not owner")
+
+        outcome = add_tags_to_asset_info(
+            session,
+            asset_info_id=asset_info_id,
+            tags=tags,
+            origin=origin,
+            create_if_missing=True,
+            asset_info_row=row,
+        )
+        session.commit()
+
+    return schemas_out.TagsAdd(**outcome)
+
+
+def remove_tags_from_asset(
+    *,
+    asset_info_id: str,
+    tags: list[str],
+    owner_id: str = "",
+) -> schemas_out.TagsRemove:
+    """
+    Detach tags from an asset reference.
+
+    Raises ValueError when the AssetInfo row is missing and PermissionError
+    when the row has an owner that differs from ``owner_id``.
+    """
+    with create_session() as session:
+        row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
+        if not row:
+            raise ValueError(f"AssetInfo {asset_info_id} not found")
+        if row.owner_id and row.owner_id != owner_id:
+            raise PermissionError("not owner")
+
+        outcome = remove_tags_from_asset_info(
+            session,
+            asset_info_id=asset_info_id,
+            tags=tags,
+        )
+        session.commit()
+
+    return schemas_out.TagsRemove(**outcome)
+
+
 def list_tags(
    prefix: str | None = None,
    limit: int = 100,
@@ -27,6 +27,7 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No
    t_start = time.perf_counter()
    created = 0
    skipped_existing = 0
+    orphans_pruned = 0
    paths: list[str] = []
    try:
        existing_paths: set[str] = set()
@@ -38,6 +39,11 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No
            except Exception as e:
                logging.exception("fast DB scan failed for %s: %s", r, e)

+        try:
+            orphans_pruned = _prune_orphaned_assets(roots)
+        except Exception as e:
+            logging.exception("orphan pruning failed: %s", e)
+
        if "models" in roots:
            paths.extend(collect_models_files())
        if "input" in roots:
@@ -85,15 +91,43 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No
    finally:
        if enable_logging:
            logging.info(
-                "Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, total_seen=%d)",
+                "Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, orphans_pruned=%d, total_seen=%d)",
                roots,
                time.perf_counter() - t_start,
                created,
                skipped_existing,
+                orphans_pruned,
                len(paths),
            )


+def _prune_orphaned_assets(roots: tuple[RootType, ...]) -> int:
+    """Prune cache states outside configured prefixes, then delete orphaned seed assets.
+
+    Returns the ``rowcount`` of the final ``Asset`` delete statement, or 0 when
+    the given roots yield no prefixes at all (in which case nothing is deleted).
+    """
+    # NOTE(review): the valid prefixes come only from the `roots` passed in, so a
+    # call with a subset of roots deletes cache states that live under the other
+    # roots as well -- confirm callers always pass every root they want kept.
+    all_prefixes = [os.path.abspath(p) for r in roots for p in prefixes_for_root(r)]
+    if not all_prefixes:
+        return 0
+
+    def make_prefix_condition(prefix: str):
+        # Force a trailing separator so "/a/models" does not also match "/a/models_old".
+        base = prefix if prefix.endswith(os.sep) else prefix + os.sep
+        escaped, esc = escape_like_prefix(base)
+        return AssetCacheState.file_path.like(escaped + "%", escape=esc)
+
+    matches_valid_prefix = sqlalchemy.or_(*[make_prefix_condition(p) for p in all_prefixes])
+
+    # Assets with a NULL hash that have no cache state row left.
+    orphan_subq = (
+        sqlalchemy.select(Asset.id)
+        .outerjoin(AssetCacheState, AssetCacheState.asset_id == Asset.id)
+        .where(Asset.hash.is_(None), AssetCacheState.id.is_(None))
+    ).scalar_subquery()
+
+    with create_session() as sess:
+        # Order matters: cache states go first so the orphan subquery sees the
+        # pruned state, then the AssetInfo rows, then the Asset rows themselves.
+        sess.execute(sqlalchemy.delete(AssetCacheState).where(~matches_valid_prefix))
+        sess.execute(sqlalchemy.delete(AssetInfo).where(AssetInfo.asset_id.in_(orphan_subq)))
+        result = sess.execute(sqlalchemy.delete(Asset).where(Asset.id.in_(orphan_subq)))
+        sess.commit()
+        return result.rowcount
+
+
 def _fast_db_consistency_pass(
    root: RootType,
    *,
@@ -25,11 +25,11 @@ class AudioEncoderModel():
        elif model_type == "whisper3":
            self.model = WhisperLargeV3(**model_config)
        self.model.eval()
-        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
        self.model_sample_rate = 16000

    def load_sd(self, sd):
-        return self.model.load_state_dict(sd, strict=False)
+        return self.model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())

    def get_sd(self):
        return self.model.state_dict()
@@ -159,6 +159,7 @@ class PerformanceFeature(enum.Enum):
    Fp8MatrixMultiplication = "fp8_matrix_mult"
    CublasOps = "cublas_ops"
    AutoTune = "autotune"
+    DynamicVRAM = "dynamic_vram"

 parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

@@ -257,3 +258,6 @@ elif args.fast == []:
 # '--fast' is provided with a list of performance features, use that list
 else:
    args.fast = set(args.fast)
+
+def enables_dynamic_vram():
+    """Return True when ``dynamic_vram`` is in ``--fast`` and neither highvram nor gpu_only is set."""
+    if PerformanceFeature.DynamicVRAM not in args.fast:
+        return False
+    return not (args.highvram or args.gpu_only)
@@ -47,10 +47,10 @@ class ClipVisionModel():
        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

-        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)

    def load_sd(self, sd):
-        return self.model.load_state_dict(sd, strict=False)
+        return self.model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())

    def get_sd(self):
        return self.model.state_dict()
@@ -236,6 +236,8 @@ class ComfyNodeABC(ABC):
    """Flags a node as experimental, informing users that it may change or not work as expected."""
    DEPRECATED: bool
    """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
+    DEV_ONLY: bool
+    """Flags a node as dev-only, hiding it from search/menus unless dev mode is enabled."""
    API_NODE: Optional[bool]
    """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview."""

@@ -203,7 +203,7 @@ class ControlNet(ControlBase):
        self.control_model = control_model
        self.load_device = load_device
        if control_model is not None:
-            self.control_model_wrapped = comfy.model_patcher.ModelPatcher(self.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
+            self.control_model_wrapped = comfy.model_patcher.CoreModelPatcher(self.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())

        self.compression_ratio = compression_ratio
        self.global_average_pooling = global_average_pooling
@@ -1,11 +1,12 @@
 import math
+import time
 from functools import partial

 from scipy import integrate
 import torch
 from torch import nn
 import torchsde
-from tqdm.auto import trange, tqdm
+from tqdm.auto import trange as trange_, tqdm

 from . import utils
 from . import deis
@@ -13,6 +14,36 @@ from . import sa_solver
 import comfy.model_patcher
 import comfy.model_sampling

+import comfy.memory_management
+
+
+def trange(*args, **kwargs):
+    """
+    Drop-in replacement for ``tqdm.auto.trange`` with a model warm-up notice.
+
+    When ``comfy.memory_management.aimdo_allocator`` is None this is a plain
+    pass-through. Otherwise the returned bar shows an initialization message
+    until its first update, and on the second update shifts its start time to
+    try to keep the load overhead out of the reported step rate.
+    """
+    if comfy.memory_management.aimdo_allocator is None:
+        return trange_(*args, **kwargs)
+
+    # Default to smoothing=1.0, but let an explicit caller value win: passing the
+    # keyword twice would raise "got multiple values for keyword argument".
+    kwargs.setdefault("smoothing", 1.0)
+    pbar = trange_(*args, **kwargs)
+    pbar._i = 0
+    pbar.set_postfix_str("  Model Initializing ...  ")
+
+    _update = pbar.update
+
+    def warmup_update(n=1):
+        pbar._i += 1
+        if pbar._i == 1:
+            pbar.i1_time = time.time()
+            pbar.set_postfix_str(" Model Initialization complete!  ")
+        elif pbar._i == 2:
+            # Bring forward the effective start time based on the diff between the first and
+            # second iteration, to attempt to remove load overhead from the final step rate estimate.
+            pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time)
+            pbar.set_postfix_str("")
+
+        _update(n)
+
+    # Instance-level override: only this bar gets the warm-up behaviour.
+    pbar.update = warmup_update
+    return pbar
+
+
 def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])

@@ -8,6 +8,7 @@ class LatentFormat:
    latent_rgb_factors_bias = None
    latent_rgb_factors_reshape = None
    taesd_decoder_name = None
+    spacial_downscale_ratio = 8

    def process_in(self, latent):
        return latent * self.scale_factor
@@ -80,6 +81,7 @@ class SD_X4(LatentFormat):

 class SC_Prior(LatentFormat):
    latent_channels = 16
+    spacial_downscale_ratio = 42
    def __init__(self):
        self.scale_factor = 1.0
        self.latent_rgb_factors = [
@@ -102,6 +104,7 @@ class SC_Prior(LatentFormat):
        ]

 class SC_B(LatentFormat):
+    spacial_downscale_ratio = 4
    def __init__(self):
        self.scale_factor = 1.0 / 0.43
        self.latent_rgb_factors = [
@@ -181,6 +184,7 @@ class Flux(SD3):

 class Flux2(LatentFormat):
    latent_channels = 128
+    spacial_downscale_ratio = 16

    def __init__(self):
        self.latent_rgb_factors =[
@@ -272,6 +276,7 @@ class Mochi(LatentFormat):
 class LTXV(LatentFormat):
    latent_channels = 128
    latent_dimensions = 3
+    spacial_downscale_ratio = 32

    def __init__(self):
        self.latent_rgb_factors = [
@@ -515,6 +520,7 @@ class Wan21(LatentFormat):
 class Wan22(Wan21):
    latent_channels = 48
    latent_dimensions = 3
+    spacial_downscale_ratio = 16

    latent_rgb_factors = [
            [ 0.0119,  0.0103,  0.0046],
@@ -592,6 +598,7 @@ class Wan22(Wan21):
 class HunyuanImage21(LatentFormat):
    latent_channels = 64
    latent_dimensions = 2
+    spacial_downscale_ratio = 32
    scale_factor = 0.75289

    latent_rgb_factors = [
@@ -725,6 +732,7 @@ class HunyuanVideo15(LatentFormat):
    latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644]
    latent_channels = 32
    latent_dimensions = 3
+    spacial_downscale_ratio = 16
    scale_factor = 1.03682
    taesd_decoder_name = "lighttaehy1_5"

@@ -747,8 +755,13 @@ class ACEAudio(LatentFormat):
    latent_channels = 8
    latent_dimensions = 2

+class ACEAudio15(LatentFormat):
+    """Latent format declaring 64 latent channels and a single latent dimension."""
+    latent_channels = 64
+    latent_dimensions = 1
+
 class ChromaRadiance(LatentFormat):
    latent_channels = 3
+    spacial_downscale_ratio = 1

    def __init__(self):
        self.latent_rgb_factors = [
@@ -0,0 +1,202 @@
+from comfy.ldm.cosmos.predict2 import MiniTrainDIT
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+
+def rotate_half(x):
+    """Split the last dimension in two halves and return ``cat(-second_half, first_half)``."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=1):
+    """Apply rotary position embedding to *x*: ``x * cos + rotate_half(x) * sin``.
+
+    ``cos`` and ``sin`` get an extra axis at ``unsqueeze_dim`` so they broadcast against *x*.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    return (x * cos_b) + (rotate_half(x) * sin_b)
+
+
+class RotaryEmbedding(nn.Module):
+    """Compute rotary position embedding cos/sin tables for given position ids."""
+
+    def __init__(self, head_dim):
+        super().__init__()
+        self.rope_theta = 10000
+        # One inverse frequency per pair of channels: theta ** (-(2i) / head_dim).
+        inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).to(dtype=torch.float) / head_dim))
+        # Non-persistent: the buffer is recomputed here and not stored in the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        """Return ``(cos, sin)`` for ``position_ids``, cast to the dtype of ``x``.
+
+        ``x`` is only consulted for its device and dtype. ``position_ids`` is
+        indexed as a 2-D ``(batch, seq)`` tensor; the results have shape
+        ``(batch, seq, 2 * len(inv_freq))``.
+        """
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # autocast is addressed under "cpu" when the device type is "mps" (or not a string).
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Frequencies are repeated so they line up with the two halves used by rotate_half.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Attention(nn.Module):
+    """Multi-head attention with per-head RMSNorm on queries and keys and optional rotary embeddings.
+
+    Acts as self-attention when no ``context`` is given to ``forward`` and as
+    cross-attention otherwise.
+    """
+
+    def __init__(self, query_dim, context_dim, n_heads, head_dim, device=None, dtype=None, operations=None):
+        super().__init__()
+
+        inner_dim = head_dim * n_heads
+        self.n_heads = n_heads
+        self.head_dim = head_dim
+        self.query_dim = query_dim
+        self.context_dim = context_dim
+
+        # `operations` supplies the Linear / RMSNorm layer classes used below.
+        self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, device=device, dtype=dtype)
+        self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
+
+        self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
+        self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
+
+        self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
+
+        self.o_proj = operations.Linear(inner_dim, query_dim, bias=False, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None, context=None, position_embeddings=None, position_embeddings_context=None):
+        """Attend from ``x`` to ``context`` (or to ``x`` itself when ``context`` is None).
+
+        ``position_embeddings`` / ``position_embeddings_context`` are ``(cos, sin)``
+        pairs applied to the queries and keys respectively; when the former is
+        given the latter must be given too. ``mask`` is forwarded as ``attn_mask``
+        to ``scaled_dot_product_attention``. The output has the leading shape of ``x``.
+        """
+        context = x if context is None else context
+        input_shape = x.shape[:-1]
+        q_shape = (*input_shape, self.n_heads, self.head_dim)
+        context_shape = context.shape[:-1]
+        kv_shape = (*context_shape, self.n_heads, self.head_dim)
+
+        # Split into heads, normalise q/k per head, then swap axes 1 and 2 so heads precede the sequence axis.
+        query_states = self.q_norm(self.q_proj(x).view(q_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(context).view(kv_shape)).transpose(1, 2)
+        value_states = self.v_proj(context).view(kv_shape).transpose(1, 2)
+
+        if position_embeddings is not None:
+            assert position_embeddings_context is not None
+            cos, sin = position_embeddings
+            query_states = apply_rotary_pos_emb(query_states, cos, sin)
+            cos, sin = position_embeddings_context
+            key_states = apply_rotary_pos_emb(key_states, cos, sin)
+
+        attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=mask)
+
+        # Merge the heads back into a single feature axis.
+        attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output
+
+    def init_weights(self):
+        """Zero the output projection weight."""
+        torch.nn.init.zeros_(self.o_proj.weight)
+
+
+class TransformerBlock(nn.Module):
+    """Pre-norm block: optional self-attention, cross-attention to a source sequence, then an MLP.
+
+    Each sub-layer is applied as a residual update. ``layer_norm`` selects
+    LayerNorm instead of RMSNorm for all three norms.
+    """
+
+    def __init__(self, source_dim, model_dim, num_heads=16, mlp_ratio=4.0, use_self_attn=False, layer_norm=False, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.use_self_attn = use_self_attn
+
+        # The self-attention sub-layer (and its norm) only exists when requested.
+        if self.use_self_attn:
+            self.norm_self_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
+            self.self_attn = Attention(
+                query_dim=model_dim,
+                context_dim=model_dim,
+                n_heads=num_heads,
+                head_dim=model_dim//num_heads,
+                device=device,
+                dtype=dtype,
+                operations=operations,
+            )
+
+        self.norm_cross_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
+        # Queries come from the model stream, keys/values from the source (context) stream.
+        self.cross_attn = Attention(
+            query_dim=model_dim,
+            context_dim=source_dim,
+            n_heads=num_heads,
+            head_dim=model_dim//num_heads,
+            device=device,
+            dtype=dtype,
+            operations=operations,
+        )
+
+        self.norm_mlp = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
+        self.mlp = nn.Sequential(
+            operations.Linear(model_dim, int(model_dim * mlp_ratio), device=device, dtype=dtype),
+            nn.GELU(),
+            operations.Linear(int(model_dim * mlp_ratio), model_dim, device=device, dtype=dtype)
+        )
+
+    def forward(self, x, context, target_attention_mask=None, source_attention_mask=None, position_embeddings=None, position_embeddings_context=None):
+        """Run the block on ``x`` with ``context`` as the cross-attention source and return the updated ``x``."""
+        if self.use_self_attn:
+            normed = self.norm_self_attn(x)
+            # Self-attention: queries and keys share the target position embeddings.
+            attn_out = self.self_attn(normed, mask=target_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings)
+            x = x + attn_out
+
+        normed = self.norm_cross_attn(x)
+        attn_out = self.cross_attn(normed, mask=source_attention_mask, context=context, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context)
+        x = x + attn_out
+
+        x = x + self.mlp(self.norm_mlp(x))
+        return x
+
+    def init_weights(self):
+        """Zero the weight of the last MLP linear layer and of the cross-attention output projection."""
+        torch.nn.init.zeros_(self.mlp[2].weight)
+        self.cross_attn.init_weights()
+
+
+class LLMAdapter(nn.Module):
+    """Embed target token ids and refine them with transformer blocks that cross-attend to source hidden states."""
+
+    def __init__(
+            self,
+            source_dim=1024,
+            target_dim=1024,
+            model_dim=1024,
+            num_layers=6,
+            num_heads=16,
+            use_self_attn=True,
+            layer_norm=False,
+            device=None,
+            dtype=None,
+            operations=None,
+        ):
+        super().__init__()
+
+        # NOTE(review): 32128 is the hard-coded embedding table size -- presumably the
+        # vocabulary size of the tokenizer producing target_input_ids; confirm.
+        self.embed = operations.Embedding(32128, target_dim, device=device, dtype=dtype)
+        # Project into the model width only when it differs from the embedding width.
+        if model_dim != target_dim:
+            self.in_proj = operations.Linear(target_dim, model_dim, device=device, dtype=dtype)
+        else:
+            self.in_proj = nn.Identity()
+        self.rotary_emb = RotaryEmbedding(model_dim//num_heads)
+        self.blocks = nn.ModuleList([
+            TransformerBlock(source_dim, model_dim, num_heads=num_heads, use_self_attn=use_self_attn, layer_norm=layer_norm, device=device, dtype=dtype, operations=operations) for _ in range(num_layers)
+        ])
+        self.out_proj = operations.Linear(model_dim, target_dim, device=device, dtype=dtype)
+        self.norm = operations.RMSNorm(target_dim, eps=1e-6, device=device, dtype=dtype)
+
+    def forward(self, source_hidden_states, target_input_ids, target_attention_mask=None, source_attention_mask=None):
+        """Return normalised, projected hidden states for ``target_input_ids``.
+
+        Attention masks are cast to bool; 2-D masks get two singleton axes
+        inserted at position 1 so they broadcast inside the attention call.
+        """
+        if target_attention_mask is not None:
+            target_attention_mask = target_attention_mask.to(torch.bool)
+            if target_attention_mask.ndim == 2:
+                target_attention_mask = target_attention_mask.unsqueeze(1).unsqueeze(1)
+
+        if source_attention_mask is not None:
+            source_attention_mask = source_attention_mask.to(torch.bool)
+            if source_attention_mask.ndim == 2:
+                source_attention_mask = source_attention_mask.unsqueeze(1).unsqueeze(1)
+
+        x = self.in_proj(self.embed(target_input_ids))
+        context = source_hidden_states
+        # Positions are simply 0..len-1 for each of the two sequences.
+        position_ids = torch.arange(x.shape[1], device=x.device).unsqueeze(0)
+        position_ids_context = torch.arange(context.shape[1], device=x.device).unsqueeze(0)
+        position_embeddings = self.rotary_emb(x, position_ids)
+        # `x` is passed again here only as the device/dtype reference for the context table.
+        position_embeddings_context = self.rotary_emb(x, position_ids_context)
+        for block in self.blocks:
+            x = block(x, context, target_attention_mask=target_attention_mask, source_attention_mask=source_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context)
+        return self.norm(self.out_proj(x))
+
+
+class Anima(MiniTrainDIT):
+    """MiniTrainDIT subclass that carries an LLMAdapter for preprocessing text embeddings."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # The adapter is built with the same device / dtype / operations keywords, if provided.
+        adapter_kwargs = {key: kwargs.get(key) for key in ("device", "dtype", "operations")}
+        self.llm_adapter = LLMAdapter(**adapter_kwargs)
+
+    def preprocess_text_embeds(self, text_embeds, text_ids):
+        """Run ``text_embeds`` through the adapter when ``text_ids`` is given; otherwise return it unchanged."""
+        if text_ids is None:
+            return text_embeds
+        return self.llm_adapter(text_embeds, text_ids)
@@ -13,6 +13,7 @@ from torchvision import transforms

 import comfy.patcher_extension
 from comfy.ldm.modules.attention import optimized_attention
+import comfy.ldm.common_dit

 def apply_rotary_pos_emb(
    t: torch.Tensor,
@@ -835,6 +836,8 @@ class MiniTrainDIT(nn.Module):
        padding_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ):
+        orig_shape = list(x.shape)
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_temporal, self.patch_spatial, self.patch_spatial))
        x_B_C_T_H_W = x
        timesteps_B_T = timesteps
        crossattn_emb = context
@@ -882,5 +885,5 @@ class MiniTrainDIT(nn.Module):
            )

        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
-        x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)
+        x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)[:, :, :orig_shape[-3], :orig_shape[-2], :orig_shape[-1]]
        return x_B_C_Tt_Hp_Wp
@@ -109,10 +109,10 @@ class HunyuanVideo15SRModel():
        self.model_class = UPSAMPLERS.get(model_type)
        self.model = self.model_class(**config).eval()

-        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)

    def load_sd(self, sd):
-        return self.model.load_state_dict(sd, strict=True)
+        return self.model.load_state_dict(sd, strict=True, assign=self.patcher.is_dynamic())

    def get_sd(self):
        return self.model.state_dict()
@@ -18,12 +18,12 @@ class CompressedTimestep:
    def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
        """
        tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
-        patches_per_frame: Number of spatial patches per frame (height * width in latent space)
+        patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression
        """
        self.batch_size, num_tokens, self.feature_dim = tensor.shape

        # Check if compression is valid (num_tokens must be divisible by patches_per_frame)
-        if num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
+        if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
            self.patches_per_frame = patches_per_frame
            self.num_frames = num_tokens // patches_per_frame

@@ -215,22 +215,9 @@ class BasicAVTransformerBlock(nn.Module):
        return (*scale_shift_ada_values, *gate_ada_values)

    def forward(
-        self,
-        x: Tuple[torch.Tensor, torch.Tensor],
-        v_context=None,
-        a_context=None,
-        attention_mask=None,
-        v_timestep=None,
-        a_timestep=None,
-        v_pe=None,
-        a_pe=None,
-        v_cross_pe=None,
-        a_cross_pe=None,
-        v_cross_scale_shift_timestep=None,
-        a_cross_scale_shift_timestep=None,
-        v_cross_gate_timestep=None,
-        a_cross_gate_timestep=None,
-        transformer_options=None,
+        self, x: Tuple[torch.Tensor, torch.Tensor], v_context=None, a_context=None, attention_mask=None, v_timestep=None, a_timestep=None,
+        v_pe=None, a_pe=None, v_cross_pe=None, a_cross_pe=None, v_cross_scale_shift_timestep=None, a_cross_scale_shift_timestep=None,
+        v_cross_gate_timestep=None, a_cross_gate_timestep=None, transformer_options=None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        run_vx = transformer_options.get("run_vx", True)
        run_ax = transformer_options.get("run_ax", True)
@@ -240,144 +227,102 @@ class BasicAVTransformerBlock(nn.Module):
        run_a2v = run_vx and transformer_options.get("a2v_cross_attn", True) and ax.numel() > 0
        run_v2a = run_ax and transformer_options.get("v2a_cross_attn", True)

+        # video
        if run_vx:
-            vshift_msa, vscale_msa, vgate_msa = (
-                self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 3))
-            )
-
+            # video self-attention
+            vshift_msa, vscale_msa = (self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 2)))
            norm_vx = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_msa) + vshift_msa
-            vx += self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options) * vgate_msa
-            vx += self.attn2(
-                comfy.ldm.common_dit.rms_norm(vx),
-                context=v_context,
-                mask=attention_mask,
-                transformer_options=transformer_options,
-            )
-
-            del vshift_msa, vscale_msa, vgate_msa
+            del vshift_msa, vscale_msa
+            attn1_out = self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options)
+            del norm_vx
+            # video cross-attention
+            vgate_msa = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(2, 3))[0]
+            vx.addcmul_(attn1_out, vgate_msa)
+            del vgate_msa, attn1_out
+            vx.add_(self.attn2(comfy.ldm.common_dit.rms_norm(vx), context=v_context, mask=attention_mask, transformer_options=transformer_options))

+        # audio
        if run_ax:
-            ashift_msa, ascale_msa, agate_msa = (
-                self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 3))
-            )
-
+            # audio self-attention
+            ashift_msa, ascale_msa = (self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 2)))
            norm_ax = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_msa) + ashift_msa
-            ax += (
-                self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options)
-                * agate_msa
-            )
-            ax += self.audio_attn2(
-                comfy.ldm.common_dit.rms_norm(ax),
-                context=a_context,
-                mask=attention_mask,
-                transformer_options=transformer_options,
-            )
+            del ashift_msa, ascale_msa
+            attn1_out = self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options)
+            del norm_ax
+            # audio cross-attention
+            agate_msa = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(2, 3))[0]
+            ax.addcmul_(attn1_out, agate_msa)
+            del agate_msa, attn1_out
+            ax.add_(self.audio_attn2(comfy.ldm.common_dit.rms_norm(ax), context=a_context, mask=attention_mask, transformer_options=transformer_options))

-            del ashift_msa, ascale_msa, agate_msa
-
-        # Audio - Video cross attention.
+        # video - audio cross attention.
        if run_a2v or run_v2a:
-            # norm3
            vx_norm3 = comfy.ldm.common_dit.rms_norm(vx)
            ax_norm3 = comfy.ldm.common_dit.rms_norm(ax)

-            (
-                scale_ca_audio_hidden_states_a2v,
-                shift_ca_audio_hidden_states_a2v,
-                scale_ca_audio_hidden_states_v2a,
-                shift_ca_audio_hidden_states_v2a,
-                gate_out_v2a,
-            ) = self.get_av_ca_ada_values(
-                self.scale_shift_table_a2v_ca_audio,
-                ax.shape[0],
-                a_cross_scale_shift_timestep,
-                a_cross_gate_timestep,
-            )
-
-            (
-                scale_ca_video_hidden_states_a2v,
-                shift_ca_video_hidden_states_a2v,
-                scale_ca_video_hidden_states_v2a,
-                shift_ca_video_hidden_states_v2a,
-                gate_out_a2v,
-            ) = self.get_av_ca_ada_values(
-                self.scale_shift_table_a2v_ca_video,
-                vx.shape[0],
-                v_cross_scale_shift_timestep,
-                v_cross_gate_timestep,
-            )
-
+            # audio to video cross attention
            if run_a2v:
-                vx_scaled = (
-                    vx_norm3 * (1 + scale_ca_video_hidden_states_a2v)
-                    + shift_ca_video_hidden_states_a2v
-                )
-                ax_scaled = (
-                    ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v)
-                    + shift_ca_audio_hidden_states_a2v
-                )
-                vx += (
-                    self.audio_to_video_attn(
-                        vx_scaled,
-                        context=ax_scaled,
-                        pe=v_cross_pe,
-                        k_pe=a_cross_pe,
-                        transformer_options=transformer_options,
-                    )
-                    * gate_out_a2v
-                )
+                scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v = self.get_ada_values(
+                    self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[:2]
+                scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v = self.get_ada_values(
+                    self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[:2]

-                del gate_out_a2v
-                del scale_ca_video_hidden_states_a2v,\
-                    shift_ca_video_hidden_states_a2v,\
-                    scale_ca_audio_hidden_states_a2v,\
-                    shift_ca_audio_hidden_states_a2v,\
+                vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_a2v_v) + shift_ca_video_hidden_states_a2v_v
+                ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v) + shift_ca_audio_hidden_states_a2v
+                del scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v, scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v

+                a2v_out = self.audio_to_video_attn(vx_scaled, context=ax_scaled, pe=v_cross_pe, k_pe=a_cross_pe, transformer_options=transformer_options)
+                del vx_scaled, ax_scaled
+
+                gate_out_a2v = self.get_ada_values(self.scale_shift_table_a2v_ca_video[4:, :], vx.shape[0], v_cross_gate_timestep)[0]
+                vx.addcmul_(a2v_out, gate_out_a2v)
+                del gate_out_a2v, a2v_out
+
+            # video to audio cross attention
            if run_v2a:
-                ax_scaled = (
-                    ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a)
-                    + shift_ca_audio_hidden_states_v2a
-                )
-                vx_scaled = (
-                    vx_norm3 * (1 + scale_ca_video_hidden_states_v2a)
-                    + shift_ca_video_hidden_states_v2a
-                )
-                ax += (
-                    self.video_to_audio_attn(
-                        ax_scaled,
-                        context=vx_scaled,
-                        pe=a_cross_pe,
-                        k_pe=v_cross_pe,
-                        transformer_options=transformer_options,
-                    )
-                    * gate_out_v2a
-                )
+                scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a = self.get_ada_values(
+                    self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[2:4]
+                scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a = self.get_ada_values(
+                    self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[2:4]

-                del gate_out_v2a
-                del scale_ca_video_hidden_states_v2a,\
-                    shift_ca_video_hidden_states_v2a,\
-                    scale_ca_audio_hidden_states_v2a,\
-                    shift_ca_audio_hidden_states_v2a
+                ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a) + shift_ca_audio_hidden_states_v2a
+                vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_v2a) + shift_ca_video_hidden_states_v2a
+                del scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a, scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a

+                v2a_out = self.video_to_audio_attn(ax_scaled, context=vx_scaled, pe=a_cross_pe, k_pe=v_cross_pe, transformer_options=transformer_options)
+                del ax_scaled, vx_scaled
+
+                gate_out_v2a = self.get_ada_values(self.scale_shift_table_a2v_ca_audio[4:, :], ax.shape[0], a_cross_gate_timestep)[0]
+                ax.addcmul_(v2a_out, gate_out_v2a)
+                del gate_out_v2a, v2a_out
+
+            del vx_norm3, ax_norm3
+
+        # video feedforward
        if run_vx:
-            vshift_mlp, vscale_mlp, vgate_mlp = (
-                self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, None))
-            )
-
+            vshift_mlp, vscale_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, 5))
            vx_scaled = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_mlp) + vshift_mlp
-            vx += self.ff(vx_scaled) * vgate_mlp
-            del vshift_mlp, vscale_mlp, vgate_mlp
+            del vshift_mlp, vscale_mlp

+            ff_out = self.ff(vx_scaled)
+            del vx_scaled
+
+            vgate_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(5, 6))[0]
+            vx.addcmul_(ff_out, vgate_mlp)
+            del vgate_mlp, ff_out
+
+        # audio feedforward
        if run_ax:
-            ashift_mlp, ascale_mlp, agate_mlp = (
-                self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, None))
-            )
-
+            ashift_mlp, ascale_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, 5))
            ax_scaled = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_mlp) + ashift_mlp
-            ax += self.audio_ff(ax_scaled) * agate_mlp
+            del ashift_mlp, ascale_mlp

-            del ashift_mlp, ascale_mlp, agate_mlp
+            ff_out = self.audio_ff(ax_scaled)
+            del ax_scaled

+            agate_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(5, 6))[0]
+            ax.addcmul_(ff_out, agate_mlp)
+            del agate_mlp, ff_out

        return vx, ax

@@ -589,9 +534,20 @@ class LTXAVModel(LTXVModel):
        audio_length = kwargs.get("audio_length", 0)
        # Separate audio and video latents
        vx, ax = self.separate_audio_and_video_latents(x, audio_length)
+
+        has_spatial_mask = False
+        if denoise_mask is not None:
+            # check if any frame has spatial variation (inpainting)
+            for frame_idx in range(denoise_mask.shape[2]):
+                frame_mask = denoise_mask[0, 0, frame_idx]
+                if frame_mask.numel() > 0 and frame_mask.min() != frame_mask.max():
+                    has_spatial_mask = True
+                    break
+
        [vx, v_pixel_coords, additional_args] = super()._process_input(
            vx, keyframe_idxs, denoise_mask, **kwargs
        )
+        additional_args["has_spatial_mask"] = has_spatial_mask

        ax, a_latent_coords = self.a_patchifier.patchify(ax)
        ax = self.audio_patchify_proj(ax)
@@ -618,8 +574,9 @@ class LTXAVModel(LTXVModel):
        # Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
        # Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
        orig_shape = kwargs.get("orig_shape")
+        has_spatial_mask = kwargs.get("has_spatial_mask", None)
        v_patches_per_frame = None
-        if orig_shape is not None and len(orig_shape) == 5:
+        if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
            # orig_shape[3] = height, orig_shape[4] = width (in latent space)
            v_patches_per_frame = orig_shape[3] * orig_shape[4]

@@ -662,10 +619,11 @@ class LTXAVModel(LTXVModel):
            )

            # Compress cross-attention timesteps (only video side, audio is too small to benefit)
+            # v_patches_per_frame is None for spatial masks, set for temporal masks or no mask
            cross_av_timestep_ss = [
                av_ca_audio_scale_shift_timestep.view(batch_size, -1, av_ca_audio_scale_shift_timestep.shape[-1]),
-                CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame),  # video - compressed
-                CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame),  # video - compressed
+                CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame),  # video - compressed if possible
+                CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame),  # video - compressed if possible
                av_ca_v2a_gate_noise_timestep.view(batch_size, -1, av_ca_v2a_gate_noise_timestep.shape[-1]),
            ]

@@ -103,20 +103,10 @@ class AudioPreprocessor:
            return waveform
        return torchaudio.functional.resample(waveform, source_rate, self.target_sample_rate)

-    @staticmethod
-    def normalize_amplitude(
-        waveform: torch.Tensor, max_amplitude: float = 0.5, eps: float = 1e-5
-    ) -> torch.Tensor:
-        waveform = waveform - waveform.mean(dim=2, keepdim=True)
-        peak = torch.max(torch.abs(waveform)) + eps
-        scale = peak.clamp(max=max_amplitude) / peak
-        return waveform * scale
-
    def waveform_to_mel(
        self, waveform: torch.Tensor, waveform_sample_rate: int, device
    ) -> torch.Tensor:
        waveform = self.resample(waveform, waveform_sample_rate)
-        waveform = self.normalize_amplitude(waveform)

        mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.target_sample_rate,
@@ -1,11 +1,11 @@
 from typing import Tuple, Union

+import threading
 import torch
 import torch.nn as nn
 import comfy.ops
 ops = comfy.ops.disable_weight_init

-
 class CausalConv3d(nn.Module):
    def __init__(
        self,
@@ -42,23 +42,34 @@ class CausalConv3d(nn.Module):
            padding_mode=spatial_padding_mode,
            groups=groups,
        )
+        self.temporal_cache_state={}

    def forward(self, x, causal: bool = True):
-        if causal:
-            first_frame_pad = x[:, :, :1, :, :].repeat(
-                (1, 1, self.time_kernel_size - 1, 1, 1)
-            )
-            x = torch.concatenate((first_frame_pad, x), dim=2)
-        else:
-            first_frame_pad = x[:, :, :1, :, :].repeat(
-                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
-            )
-            last_frame_pad = x[:, :, -1:, :, :].repeat(
-                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
-            )
-            x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
-        x = self.conv(x)
-        return x
+        tid = threading.get_ident()  # cache entries are keyed by the calling thread
+
+        cached, is_end = self.temporal_cache_state.get(tid, (None, False))  # (tail frames saved by the previous call, end-of-sequence flag)
+        if cached is None:  # nothing cached: build the front padding by repeating the first frame
+            padding_length = self.time_kernel_size - 1
+            if not causal:
+                padding_length = padding_length // 2  # non-causal: only half of the padding goes in front
+            if x.shape[2] == 0:
+                return x  # no frames along dim 2: nothing to pad or convolve
+            cached = x[:, :, :1, :, :].repeat((1, 1, padding_length, 1, 1))
+        pieces = [ cached, x ]
+        if is_end and not causal:  # non-causal tail padding (repeated last frame) is added only once is_end is set
+            pieces.append(x[:, :, -1:, :, :].repeat((1, 1, (self.time_kernel_size - 1) // 2, 1, 1)))
+
+        needs_caching = not is_end  # after the end flag nothing is saved for a following call
+        if needs_caching and x.shape[2] >= self.time_kernel_size - 1:  # x alone has enough frames: cache its tail before concatenating
+            needs_caching = False
+            self.temporal_cache_state[tid] = (x[:, :, -(self.time_kernel_size - 1):, :, :], False)  # NOTE(review): if time_kernel_size == 1 this slice is x[:, :, -0:], i.e. every frame - confirm that case cannot occur
+
+        x = torch.cat(pieces, dim=2)
+
+        if needs_caching:  # x was too short on its own: cache the tail of the concatenated tensor instead
+            self.temporal_cache_state[tid] = (x[:, :, -(self.time_kernel_size - 1):, :, :], False)
+
+        return self.conv(x) if x.shape[2] >= self.time_kernel_size else x[:, :, :0, :, :]  # fewer than time_kernel_size frames: return zero frames instead of convolving

    @property
    def weight(self):
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import threading
 import torch
 from torch import nn
 from functools import partial
@@ -6,12 +7,35 @@ import math
 from einops import rearrange
 from typing import List, Optional, Tuple, Union
 from .conv_nd_factory import make_conv_nd, make_linear_nd
+from .causal_conv3d import CausalConv3d
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
+from comfy.ldm.modules.diffusionmodules.model import torch_cat_if_needed

 ops = comfy.ops.disable_weight_init

+def mark_conv3d_ended(module):  # Set this thread's end flag on every CausalConv3d inside module.
+    tid = threading.get_ident()
+    for _, m in module.named_modules():
+        if isinstance(m, CausalConv3d):
+            current = m.temporal_cache_state.get(tid, (None, False))
+            m.temporal_cache_state[tid] = (current[0], True)  # keep whatever tensor was cached, only flip the flag
+
+def split2(tensor, split_point, dim=2):  # Split tensor along dim into (first split_point entries, the rest).
+    return torch.split(tensor, [split_point, tensor.shape[dim] - split_point], dim=dim)  # torch.split returns views of tensor, not copies
+
+def add_exchange_cache(dest, cache_in, new_input, dim=2):  # Add cache_in followed by new_input into dest in place along dim; return what was not consumed.
+    if dest is not None:
+        if cache_in is not None:
+            cache_to_dest = min(dest.shape[dim], cache_in.shape[dim])  # how much of the cache fits into dest
+            lead_in_dest, dest = split2(dest, cache_to_dest, dim=dim)
+            lead_in_source, cache_in = split2(cache_in, cache_to_dest, dim=dim)
+            lead_in_dest.add_(lead_in_source)  # split2 yields views, so this writes into the caller's dest
+        body, new_input = split2(new_input, dest.shape[dim], dim)  # the remaining part of dest is filled from new_input
+        dest.add_(body)
+    return torch_cat_if_needed([cache_in, new_input], dim=dim)  # leftovers of both inputs; torch_cat_if_needed is defined elsewhere - presumably joins them along dim
+
 class Encoder(nn.Module):
    r"""
    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
@@ -205,7 +229,7 @@ class Encoder(nn.Module):

        self.gradient_checkpointing = False

-    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+    def forward_orig(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        r"""The forward method of the `Encoder` class."""

        sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
@@ -254,6 +278,22 @@ class Encoder(nn.Module):

        return sample

+    def forward(self, *args, **kwargs):
+        # The encoder has no chunked-cache support, so flag the end up front so the convolutions don't use the cache.
+        mark_conv3d_ended(self)
+        try:
+            return self.forward_orig(*args, **kwargs)
+        finally:
+            tid = threading.get_ident()  # NOTE(review): redundant, tid is read again inside the loop below
+            for _, module in self.named_modules():
+                # ComfyUI doesn't thread this kind of stuff today, but just in case
+                # we key on the thread to make it thread safe.
+                tid = threading.get_ident()
+                if hasattr(module, "temporal_cache_state"):
+                    module.temporal_cache_state.pop(tid, None)  # drop this thread's entry so nothing carries over to the next call
+
+
+MAX_CHUNK_SIZE=(128 * 1024 ** 2)  # 128 MiB: byte budget used by Decoder.forward_orig to decide how many chunks a block output is split into

 class Decoder(nn.Module):
    r"""
@@ -341,18 +381,6 @@ class Decoder(nn.Module):
                    timestep_conditioning=timestep_conditioning,
                    spatial_padding_mode=spatial_padding_mode,
                )
-            elif block_name == "attn_res_x":
-                block = UNetMidBlock3D(
-                    dims=dims,
-                    in_channels=input_channel,
-                    num_layers=block_params["num_layers"],
-                    resnet_groups=norm_num_groups,
-                    norm_layer=norm_layer,
-                    inject_noise=block_params.get("inject_noise", False),
-                    timestep_conditioning=timestep_conditioning,
-                    attention_head_dim=block_params["attention_head_dim"],
-                    spatial_padding_mode=spatial_padding_mode,
-                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
                block = ResnetBlock3D(
@@ -428,8 +456,9 @@ class Decoder(nn.Module):
            )
            self.last_scale_shift_table = nn.Parameter(torch.empty(2, output_channel))

+
    # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
-    def forward(
+    def forward_orig(
        self,
        sample: torch.FloatTensor,
        timestep: Optional[torch.Tensor] = None,
@@ -437,6 +466,7 @@ class Decoder(nn.Module):
        r"""The forward method of the `Decoder` class."""
        batch_size = sample.shape[0]

+        mark_conv3d_ended(self.conv_in)
        sample = self.conv_in(sample, causal=self.causal)

        checkpoint_fn = (
@@ -445,24 +475,12 @@ class Decoder(nn.Module):
            else lambda x: x
        )

-        scaled_timestep = None
+        timestep_shift_scale = None
        if self.timestep_conditioning:
            assert (
                timestep is not None
            ), "should pass timestep with timestep_conditioning=True"
            scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device)
-
-        for up_block in self.up_blocks:
-            if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
-                sample = checkpoint_fn(up_block)(
-                    sample, causal=self.causal, timestep=scaled_timestep
-                )
-            else:
-                sample = checkpoint_fn(up_block)(sample, causal=self.causal)
-
-        sample = self.conv_norm_out(sample)
-
-        if self.timestep_conditioning:
            embedded_timestep = self.last_time_embedder(
                timestep=scaled_timestep.flatten(),
                resolution=None,
@@ -483,16 +501,62 @@ class Decoder(nn.Module):
                embedded_timestep.shape[-2],
                embedded_timestep.shape[-1],
            )
-            shift, scale = ada_values.unbind(dim=1)
-            sample = sample * (1 + scale) + shift
+            timestep_shift_scale = ada_values.unbind(dim=1)

-        sample = self.conv_act(sample)
-        sample = self.conv_out(sample, causal=self.causal)
+        output = []
+
+        def run_up(idx, sample, ended):  # Run up_blocks[idx:] and the output head on one chunk; ended marks the final chunk.
+            if idx >= len(self.up_blocks):  # past the last up block: apply the output head
+                sample = self.conv_norm_out(sample)
+                if timestep_shift_scale is not None:
+                    shift, scale = timestep_shift_scale
+                    sample = sample * (1 + scale) + shift
+                sample = self.conv_act(sample)
+                if ended:
+                    mark_conv3d_ended(self.conv_out)
+                sample = self.conv_out(sample, causal=self.causal)
+                if sample is not None and sample.shape[2] > 0:
+                    output.append(sample)  # collected chunks are concatenated along dim 2 after the top-level call
+                return
+
+            up_block = self.up_blocks[idx]
+            if (ended):
+                mark_conv3d_ended(up_block)  # flag every CausalConv3d in this block before it sees the final chunk
+            if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
+                sample = checkpoint_fn(up_block)(
+                    sample, causal=self.causal, timestep=scaled_timestep
+                )
+            else:
+                sample = checkpoint_fn(up_block)(sample, causal=self.causal)
+
+            if sample is None or sample.shape[2] == 0:  # the block produced no frames: nothing to pass on
+                return
+
+            total_bytes = sample.numel() * sample.element_size()
+            num_chunks = (total_bytes + MAX_CHUNK_SIZE - 1) // MAX_CHUNK_SIZE  # ceil(total_bytes / MAX_CHUNK_SIZE)
+            samples = torch.chunk(sample, chunks=num_chunks, dim=2)  # torch.chunk may return fewer chunks than requested, hence len(samples) below
+
+            for chunk_idx, sample1 in enumerate(samples):
+                run_up(idx + 1, sample1, ended and chunk_idx == len(samples) - 1)  # only the last chunk of an already-final input is itself final
+
+        run_up(0, sample, True)
+        sample = torch.cat(output, dim=2)

        sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)

        return sample

+    def forward(self, *args, **kwargs):  # Run forward_orig, then always clear this thread's temporal caches.
+        try:
+            return self.forward_orig(*args, **kwargs)
+        finally:
+            for _, module in self.named_modules():
+                # ComfyUI doesn't thread this kind of stuff today, but just in case
+                # we key on the thread to make it thread safe.
+                tid = threading.get_ident()
+                if hasattr(module, "temporal_cache_state"):
+                    module.temporal_cache_state.pop(tid, None)  # runs on errors too, so a failed decode leaves no cached state behind
+

 class UNetMidBlock3D(nn.Module):
    """
@@ -663,8 +727,22 @@ class DepthToSpaceUpsample(nn.Module):
        )
        self.residual = residual
        self.out_channels_reduction_factor = out_channels_reduction_factor
+        self.temporal_cache_state = {}

    def forward(self, x, causal: bool = True, timestep: Optional[torch.Tensor] = None):
+        tid = threading.get_ident()
+        cached, drop_first_conv, drop_first_res = self.temporal_cache_state.get(tid, (None, True, True))
+        y = self.conv(x, causal=causal)
+        y = rearrange(
+            y,
+            "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
+            p1=self.stride[0],
+            p2=self.stride[1],
+            p3=self.stride[2],
+        )
+        if self.stride[0] == 2 and y.shape[2] > 0 and drop_first_conv:
+            y = y[:, :, 1:, :, :]
+            drop_first_conv = False
        if self.residual:
            # Reshape and duplicate the input to match the output shape
            x_in = rearrange(
@@ -676,21 +754,20 @@ class DepthToSpaceUpsample(nn.Module):
            )
            num_repeat = math.prod(self.stride) // self.out_channels_reduction_factor
            x_in = x_in.repeat(1, num_repeat, 1, 1, 1)
-            if self.stride[0] == 2:
+            if self.stride[0] == 2 and x_in.shape[2] > 0 and drop_first_res:
                x_in = x_in[:, :, 1:, :, :]
-        x = self.conv(x, causal=causal)
-        x = rearrange(
-            x,
-            "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
-            p1=self.stride[0],
-            p2=self.stride[1],
-            p3=self.stride[2],
-        )
-        if self.stride[0] == 2:
-            x = x[:, :, 1:, :, :]
-        if self.residual:
-            x = x + x_in
-        return x
+                drop_first_res = False
+
+            if y.shape[2] == 0:
+                y = None
+
+            cached = add_exchange_cache(y, cached, x_in, dim=2)
+            self.temporal_cache_state[tid] = (cached, drop_first_conv, drop_first_res)
+
+        else:
+            self.temporal_cache_state[tid] = (None, drop_first_conv, False)
+
+        return y

 class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
@@ -807,6 +884,8 @@ class ResnetBlock3D(nn.Module):
                torch.randn(4, in_channels) / in_channels**0.5
            )

+        self.temporal_cache_state={}
+
    def _feed_spatial_noise(
        self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
    ) -> torch.FloatTensor:
@@ -880,9 +959,12 @@ class ResnetBlock3D(nn.Module):

        input_tensor = self.conv_shortcut(input_tensor)

-        output_tensor = input_tensor + hidden_states
+        tid = threading.get_ident()
+        cached = self.temporal_cache_state.get(tid, None)
+        cached = add_exchange_cache(hidden_states, cached, input_tensor, dim=2)
+        self.temporal_cache_state[tid] = cached

-        return output_tensor
+        return hidden_states


 def patchify(x, patch_size_hw, patch_size_t=1):
@@ -13,10 +13,53 @@ from comfy.ldm.modules.attention import optimized_attention_masked
 from comfy.ldm.flux.layers import EmbedND
 from comfy.ldm.flux.math import apply_rope
 import comfy.patcher_extension
+import comfy.utils


-def modulate(x, scale):
-    return x * (1 + scale.unsqueeze(1))
+def invert_slices(slices, length):
+    """Return the ``(start, end)`` gaps of ``[0, length)`` not covered by ``slices``.
+
+    The pairs are visited in sorted order and overlapping pairs are merged
+    implicitly by remembering the furthest end seen so far. A gap in front
+    of a slice is not clipped to ``length``.
+    """
+    gaps = []
+    cursor = 0
+    for lo, hi in sorted(slices):
+        # Everything between the cursor and the next start is uncovered.
+        if lo > cursor:
+            gaps.append((cursor, lo))
+        if hi > cursor:
+            cursor = hi
+    # Trailing region after the last covered position.
+    if length > cursor:
+        gaps.append((cursor, length))
+    return gaps
+
+
+def modulate(x, scale, timestep_zero_index=None):
+    """Scale ``x`` by ``1 + scale`` (broadcast over dim 1).
+
+    With ``timestep_zero_index=None`` a new tensor is returned. Otherwise
+    ``scale`` is split in half along dim 0 and ``x`` is modified in place:
+    the listed ``(start, end)`` ranges of dim 1 are multiplied by the second
+    half, every remaining position by the first half.
+    """
+    factor = 1 + scale.unsqueeze(1)
+    if timestep_zero_index is None:
+        return x * factor
+
+    half = factor.size(0) // 2
+    complement = invert_slices(timestep_zero_index, x.shape[1])
+    for start, end in timestep_zero_index:
+        x[:, start:end] *= factor[half:]
+    for start, end in complement:
+        x[:, start:end] *= factor[:half]
+    return x
+
+
+def apply_gate(gate, x, timestep_zero_index=None):
+    """Multiply ``x`` by ``gate``.
+
+    With ``timestep_zero_index=None`` this is a plain ``gate * x``. Otherwise
+    ``gate`` is split in half along dim 0 and ``x`` is modified in place:
+    the listed ``(start, end)`` ranges of dim 1 use the second half, every
+    remaining position uses the first half.
+    """
+    if timestep_zero_index is None:
+        return gate * x
+
+    half = gate.size(0) // 2
+    complement = invert_slices(timestep_zero_index, x.shape[1])
+    for start, end in timestep_zero_index:
+        x[:, start:end] *= gate[half:]
+    for start, end in complement:
+        x[:, start:end] *= gate[:half]
+    return x

 #############################################################################
 #                               Core NextDiT Model                              #
@@ -258,6 +301,7 @@ class JointTransformerBlock(nn.Module):
        x_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        adaln_input: Optional[torch.Tensor]=None,
+        timestep_zero_index=None,
        transformer_options={},
    ):
        """
@@ -276,18 +320,18 @@ class JointTransformerBlock(nn.Module):
            assert adaln_input is not None
            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)

-            x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
+            x = x + apply_gate(gate_msa.unsqueeze(1).tanh(), self.attention_norm2(
                clamp_fp16(self.attention(
-                    modulate(self.attention_norm1(x), scale_msa),
+                    modulate(self.attention_norm1(x), scale_msa, timestep_zero_index=timestep_zero_index),
                    x_mask,
                    freqs_cis,
                    transformer_options=transformer_options,
-                ))
+                ))), timestep_zero_index=timestep_zero_index
            )
-            x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
+            x = x + apply_gate(gate_mlp.unsqueeze(1).tanh(), self.ffn_norm2(
                clamp_fp16(self.feed_forward(
-                    modulate(self.ffn_norm1(x), scale_mlp),
-                ))
+                    modulate(self.ffn_norm1(x), scale_mlp, timestep_zero_index=timestep_zero_index),
+                ))), timestep_zero_index=timestep_zero_index
            )
        else:
            assert adaln_input is None
@@ -345,13 +389,37 @@ class FinalLayer(nn.Module):
            ),
        )

-    def forward(self, x, c):
+    def forward(self, x, c, timestep_zero_index=None):
+        """Modulate the normalised input with a scale derived from ``c``, then project it.
+
+        ``timestep_zero_index`` is passed through unchanged to ``modulate``.
+        """
        scale = self.adaLN_modulation(c)
-        x = modulate(self.norm_final(x), scale)
+        x = modulate(self.norm_final(x), scale, timestep_zero_index=timestep_zero_index)
        x = self.linear(x)
        return x


+def pad_zimage(feats, pad_token, pad_tokens_multiple):
+    """Pad ``feats`` along dim 1 with copies of ``pad_token`` up to a multiple of ``pad_tokens_multiple``.
+
+    Returns ``(padded_feats, pad_extra)`` where ``pad_extra`` is the number of
+    positions appended (0 when the length is already a multiple).
+    """
+    batch, seq_len = feats.shape[0], feats.shape[1]
+    pad_extra = (-seq_len) % pad_tokens_multiple
+    # Fresh copy of the pad token on the features' device/dtype, tiled to fill the gap.
+    filler = pad_token.to(device=feats.device, dtype=feats.dtype, copy=True)
+    filler = filler.unsqueeze(0).repeat(batch, pad_extra, 1)
+    padded = torch.cat((feats, filler), dim=1)
+    return padded, pad_extra
+
+
+def pos_ids_x(start_t, H_tokens, W_tokens, batch_size, device, transformer_options={}):
+    """Build float32 position ids of shape ``(batch_size, H_tokens * W_tokens, 3)``.
+
+    Channel 0 holds ``start_t``; channels 1 and 2 hold the row and column
+    index of each token in row-major order. ``transformer_options["rope_options"]``
+    may scale (``scale_y``/``scale_x``) and shift (``shift_y``/``shift_x``) them.
+    """
+    rope_options = transformer_options.get("rope_options", None)
+    if rope_options is None:
+        h_scale, w_scale = 1.0, 1.0
+        h_start, w_start = 0, 0
+    else:
+        h_scale = rope_options.get("scale_y", 1.0)
+        w_scale = rope_options.get("scale_x", 1.0)
+        h_start = rope_options.get("shift_y", 0.0)
+        w_start = rope_options.get("shift_x", 0.0)
+
+    rows = torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start
+    cols = torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start
+
+    x_pos_ids = torch.zeros((batch_size, H_tokens * W_tokens, 3), dtype=torch.float32, device=device)
+    x_pos_ids[:, :, 0] = start_t
+    # Each row index repeats W_tokens times; the column ramp repeats once per row.
+    x_pos_ids[:, :, 1] = rows.view(-1, 1).repeat(1, W_tokens).flatten()
+    x_pos_ids[:, :, 2] = cols.view(1, -1).repeat(H_tokens, 1).flatten()
+    return x_pos_ids
+
+
 class NextDiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.
@@ -378,10 +446,12 @@ class NextDiT(nn.Module):
        time_scale=1.0,
        pad_tokens_multiple=None,
        clip_text_dim=None,
+        siglip_feat_dim=None,
        image_model=None,
        device=None,
        dtype=None,
        operations=None,
+        **kwargs,
    ) -> None:
        super().__init__()
        self.dtype = dtype
@@ -491,6 +561,41 @@ class NextDiT(nn.Module):
                for layer_id in range(n_layers)
            ]
        )
+
+        if siglip_feat_dim is not None:
+            self.siglip_embedder = nn.Sequential(
+                operation_settings.get("operations").RMSNorm(siglip_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
+                operation_settings.get("operations").Linear(
+                    siglip_feat_dim,
+                    dim,
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )
+            self.siglip_refiner = nn.ModuleList(
+                [
+                    JointTransformerBlock(
+                        layer_id,
+                        dim,
+                        n_heads,
+                        n_kv_heads,
+                        multiple_of,
+                        ffn_dim_multiplier,
+                        norm_eps,
+                        qk_norm,
+                        modulation=False,
+                        operation_settings=operation_settings,
+                    )
+                    for layer_id in range(n_refiner_layers)
+                ]
+            )
+            self.siglip_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
+        else:
+            self.siglip_embedder = None
+            self.siglip_refiner = None
+            self.siglip_pad_token = None
+
        # This norm final is in the lumina 2.0 code but isn't actually used for anything.
        # self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings)
@@ -531,70 +636,168 @@ class NextDiT(nn.Module):
            imgs = torch.stack(imgs, dim=0)
        return imgs

-    def patchify_and_embed(
-        self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={}
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
-        bsz = len(x)
-        pH = pW = self.patch_size
-        device = x[0].device
-        orig_x = x
-
-        if self.pad_tokens_multiple is not None:
-            pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
-            cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)
+    def embed_cap(self, cap_feats=None, offset=0, bsz=1, device=None, dtype=None):
+        """Embed caption features and build their rotary position embeddings.
+
+        When ``cap_feats`` is given it goes through ``self.cap_embedder`` and, if
+        ``self.pad_tokens_multiple`` is set, is padded with ``self.cap_pad_token``.
+        When it is ``None`` the result is ``self.pad_tokens_multiple`` copies of
+        the pad token per batch entry.
+
+        Returns ``(embeds, freqs_cis, cap_feats_len)``: two 1-tuples holding the
+        features and the ``self.rope_embedder`` output, plus the caption length
+        measured after embedding but before padding (0 when ``cap_feats`` is ``None``).
+        """
+        if cap_feats is not None:
+            cap_feats = self.cap_embedder(cap_feats)
+            cap_feats_len = cap_feats.shape[1]
+            if self.pad_tokens_multiple is not None:
+                cap_feats, _ = pad_zimage(cap_feats, self.cap_pad_token, self.pad_tokens_multiple)
+        else:
+            cap_feats_len = 0
+            # NOTE(review): pad_tokens_multiple is used as a repeat count here, so this
+            # branch presumably requires it to be set - confirm against callers.
+            cap_feats = self.cap_pad_token.to(device=device, dtype=dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1)

+        # Only channel 0 of the position ids is filled: token index + 1 + offset.
        cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device)
-        cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0
+        cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0 + offset
+        embeds = (cap_feats,)
+        freqs_cis = (self.rope_embedder(cap_pos_ids).movedim(1, 2),)
+        return embeds, freqs_cis, cap_feats_len
+
+    def embed_all(self, x, cap_feats=None, siglip_feats=None, offset=0, omni=False, transformer_options={}):
+        """Embed one image together with its optional caption and SigLIP features.
+
+        Returns ``(embeds, freqs_cis, next_offset)``. ``embeds`` and ``freqs_cis``
+        are 3-tuples ordered (caption, siglip, image). The siglip slot is ``None``
+        when ``omni`` is false, when the model has no siglip embedder, or when
+        neither siglip features nor a siglip pad token are available.
+        ``next_offset`` is the caption offset computed here plus ``len(freqs_cis) - 1``.
+        Position ids are built with a batch dimension of 1.
+        """
+        bsz = 1
+        pH = pW = self.patch_size
+        device = x.device
+        embeds, freqs_cis, cap_feats_len = self.embed_cap(cap_feats, offset=offset, bsz=bsz, device=device, dtype=x.dtype)
+
+        if (not omni) or self.siglip_embedder is None:
+            # Without siglip handling the offset advances by the padded caption length.
+            cap_feats_len = embeds[0].shape[1] + offset
+            embeds += (None,)
+            freqs_cis += (None,)
+        else:
+            # Here the offset advances by the unpadded caption length returned by embed_cap.
+            cap_feats_len += offset
+            if siglip_feats is not None:
+                b, h, w, c = siglip_feats.shape
+                # NOTE(review): permute to (b, c, h, w) followed by reshape to (b, h * w, c)
+                # is not a plain spatial flatten - confirm this layout is intended.
+                siglip_feats = siglip_feats.permute(0, 3, 1, 2).reshape(b, h * w, c)
+                siglip_feats = self.siglip_embedder(siglip_feats)
+                siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device)
+                siglip_pos_ids[:, :, 0] = cap_feats_len + 2
+                siglip_pos_ids[:, :, 1] = (torch.linspace(0, h * 8 - 1, steps=h, dtype=torch.float32, device=device).floor()).view(-1, 1).repeat(1, w).flatten()
+                siglip_pos_ids[:, :, 2] = (torch.linspace(0, w * 8 - 1, steps=w, dtype=torch.float32, device=device).floor()).view(1, -1).repeat(h, 1).flatten()
+                if self.siglip_pad_token is not None:
+                    siglip_feats, pad_extra = pad_zimage(siglip_feats, self.siglip_pad_token, self.pad_tokens_multiple)  # TODO: double check
+                    siglip_pos_ids = torch.nn.functional.pad(siglip_pos_ids, (0, 0, 0, pad_extra))
+            else:
+                # No siglip features: substitute pad tokens with all-zero position ids.
+                if self.siglip_pad_token is not None:
+                    siglip_feats = self.siglip_pad_token.to(device=device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1)
+                    siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device)
+
+            if siglip_feats is None:
+                embeds += (None,)
+                freqs_cis += (None,)
+            else:
+                embeds += (siglip_feats,)
+                freqs_cis += (self.rope_embedder(siglip_pos_ids).movedim(1, 2),)

        B, C, H, W = x.shape
        x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
-
-        rope_options = transformer_options.get("rope_options", None)
-        h_scale = 1.0
-        w_scale = 1.0
-        h_start = 0
-        w_start = 0
-        if rope_options is not None:
-            h_scale = rope_options.get("scale_y", 1.0)
-            w_scale = rope_options.get("scale_x", 1.0)
-
-            h_start = rope_options.get("shift_y", 0.0)
-            w_start = rope_options.get("shift_x", 0.0)
-
-        H_tokens, W_tokens = H // pH, W // pW
-        x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device)
-        x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1
-        x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
-        x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
-
+        # Image tokens use cap_feats_len + 1 in channel 0 (siglip tokens above use + 2).
+        x_pos_ids = pos_ids_x(cap_feats_len + 1, H // pH, W // pW, bsz, device, transformer_options=transformer_options)
        if self.pad_tokens_multiple is not None:
-            pad_extra = (-x.shape[1]) % self.pad_tokens_multiple
-            x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
+            x, pad_extra = pad_zimage(x, self.x_pad_token, self.pad_tokens_multiple)
            x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra))

-        freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)
+        embeds += (x,)
+        freqs_cis += (self.rope_embedder(x_pos_ids).movedim(1, 2),)
+        return embeds, freqs_cis, cap_feats_len + len(freqs_cis) - 1
+
+
+    def patchify_and_embed(
+        self, x: torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, ref_latents=[], ref_contexts=[], siglip_feats=[], transformer_options={}
    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
+        """Embed the main latent, its caption and any reference inputs into one token sequence.
+
+        Returns six values (the annotation above still lists five): the
+        concatenated embeddings ordered (caption, optional siglip, image),
+        ``mask`` (always ``None`` here), ``img_sizes``, ``l_effective_cap_len``,
+        the concatenated rotary embeddings in the same order, and
+        ``timestep_zero_index`` (``None`` unless ``ref_latents`` is non-empty).
+        The list-typed default arguments are only read, never mutated, here.
+        """
+        bsz = x.shape[0]
+        cap_mask = None  # TODO?
+        main_siglip = None
+        orig_x = x
+
+        # Per-kind accumulators, indexed (caption, siglip, image).
+        embeds = ([], [], [])
+        freqs_cis = ([], [], [])
+        leftover_cap = []
+
+        start_t = 0
+        omni = len(ref_latents) > 0
+        if omni:
+            for i, ref in enumerate(ref_latents):
+                if i < len(ref_contexts):
+                    ref_con = ref_contexts[i]
+                else:
+                    ref_con = None
+                if i < len(siglip_feats):
+                    sig_feat = siglip_feats[i]
+                else:
+                    sig_feat = None
+
+                out = self.embed_all(ref, ref_con, sig_feat, offset=start_t, omni=omni, transformer_options=transformer_options)
+                # NOTE: this inner loop reuses the name i; the outer enumerate rebinds it on its next iteration.
+                for i, e in enumerate(out[0]):
+                    if e is not None:
+                        embeds[i].append(comfy.utils.repeat_to_batch_size(e, bsz))
+                        freqs_cis[i].append(out[1][i])
+                start_t = out[2]
+            # Contexts beyond the number of reference latents are embedded as caption-only segments further down.
+            leftover_cap = ref_contexts[len(ref_latents):]
+
+        H, W = x.shape[-2], x.shape[-1]
+        img_sizes = [(H, W)] * bsz
+        out = self.embed_all(x, cap_feats, main_siglip, offset=start_t, omni=omni, transformer_options=transformer_options)
+        # Token counts of the main image and main caption, used for the index ranges below.
+        img_len = out[0][-1].shape[1]
+        cap_len = out[0][0].shape[1]
+        for i, e in enumerate(out[0]):
+            if e is not None:
+                e = comfy.utils.repeat_to_batch_size(e, bsz)
+                embeds[i].append(e)
+                freqs_cis[i].append(out[1][i])
+        start_t = out[2]
+
+        for cap in leftover_cap:
+            out = self.embed_cap(cap, offset=start_t, bsz=bsz, device=x.device, dtype=x.dtype)
+            cap_len += out[0][0].shape[1]
+            embeds[0].append(comfy.utils.repeat_to_batch_size(out[0][0], bsz))
+            freqs_cis[0].append(out[1][0])
+            start_t += out[2]

        patches = transformer_options.get("patches", {})

        # refine context
+        cap_feats = torch.cat(embeds[0], dim=1)
+        cap_freqs_cis = torch.cat(freqs_cis[0], dim=1)
        for layer in self.context_refiner:
-            cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)
+            cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options)
+
+        feats = (cap_feats,)
+        fc = (cap_freqs_cis,)
+
+        if omni and len(embeds[1]) > 0:
+            siglip_mask = None
+            siglip_feats_combined = torch.cat(embeds[1], dim=1)
+            siglip_feats_freqs_cis = torch.cat(freqs_cis[1], dim=1)
+            if self.siglip_refiner is not None:
+                for layer in self.siglip_refiner:
+                    siglip_feats_combined = layer(siglip_feats_combined, siglip_mask, siglip_feats_freqs_cis, transformer_options=transformer_options)
+            feats += (siglip_feats_combined,)
+            fc += (siglip_feats_freqs_cis,)

        padded_img_mask = None
+        x = torch.cat(embeds[-1], dim=1)
+        fc_x = torch.cat(freqs_cis[-1], dim=1)
+        if omni:
+            # Range of the main image tokens, which sit at the end of the image sequence.
+            timestep_zero_index = [(x.shape[1] - img_len, x.shape[1])]
+        else:
+            timestep_zero_index = None
+
        x_input = x
        for i, layer in enumerate(self.noise_refiner):
-            x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
+            x = layer(x, padded_img_mask, fc_x, t, timestep_zero_index=timestep_zero_index, transformer_options=transformer_options)
            if "noise_refiner" in patches:
                for p in patches["noise_refiner"]:
-                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
+                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": fc_x, "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
                    if "img" in out:
                        x = out["img"]

-        padded_full_embed = torch.cat((cap_feats, x), dim=1)
+        padded_full_embed = torch.cat(feats + (x,), dim=1)
+        if timestep_zero_index is not None:
+            # Re-express the image range relative to the full sequence and add
+            # the range covering the last cap_len caption tokens.
+            ind = padded_full_embed.shape[1] - x.shape[1]
+            timestep_zero_index = [(ind + x.shape[1] - img_len, ind + x.shape[1])]
+            timestep_zero_index.append((feats[0].shape[1] - cap_len, feats[0].shape[1]))
+
        mask = None
-        img_sizes = [(H, W)] * bsz
-        l_effective_cap_len = [cap_feats.shape[1]] * bsz
-        return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis
+        # Everything in front of the main image tokens counts as the caption length.
+        l_effective_cap_len = [padded_full_embed.shape[1] - img_len] * bsz
+        return padded_full_embed, mask, img_sizes, l_effective_cap_len, torch.cat(fc + (fc_x,), dim=1), timestep_zero_index

    def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
@@ -604,7 +807,11 @@ class NextDiT(nn.Module):
        ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)

    # def forward(self, x, t, cap_feats, cap_mask):
-    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, ref_latents=[], ref_contexts=[], siglip_feats=[], transformer_options={}, **kwargs):
+        omni = len(ref_latents) > 0
+        if omni:
+            timesteps = torch.cat([timesteps * 0, timesteps], dim=0)
+
        t = 1.0 - timesteps
        cap_feats = context
        cap_mask = attention_mask
@@ -619,8 +826,6 @@ class NextDiT(nn.Module):
        t = self.t_embedder(t * self.time_scale, dtype=x.dtype)  # (N, D)
        adaln_input = t

-        cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
-
        if self.clip_text_pooled_proj is not None:
            pooled = kwargs.get("clip_text_pooled", None)
            if pooled is not None:
@@ -632,7 +837,7 @@ class NextDiT(nn.Module):

        patches = transformer_options.get("patches", {})
        x_is_tensor = isinstance(x, torch.Tensor)
-        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options)
+        img, mask, img_size, cap_size, freqs_cis, timestep_zero_index = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, ref_latents=ref_latents, ref_contexts=ref_contexts, siglip_feats=siglip_feats, transformer_options=transformer_options)
        freqs_cis = freqs_cis.to(img.device)

        transformer_options["total_blocks"] = len(self.layers)
@@ -640,7 +845,7 @@ class NextDiT(nn.Module):
        img_input = img
        for i, layer in enumerate(self.layers):
            transformer_options["block_index"] = i
-            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+            img = layer(img, mask, freqs_cis, adaln_input, timestep_zero_index=timestep_zero_index, transformer_options=transformer_options)
            if "double_block" in patches:
                for p in patches["double_block"]:
                    out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
@@ -649,8 +854,7 @@ class NextDiT(nn.Module):
                    if "txt" in out:
                        img[:, :cap_size[0]] = out["txt"]

-        img = self.final_layer(img, adaln_input)
+        img = self.final_layer(img, adaln_input, timestep_zero_index=timestep_zero_index)
        img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
-
        return -img

@@ -14,10 +14,13 @@ if model_management.xformers_enabled_vae():
    import xformers.ops

 def torch_cat_if_needed(xl, dim):
+    """Concatenate the usable tensors of ``xl`` along ``dim``.
+
+    Entries that are ``None`` or have size 0 along ``dim`` are dropped first.
+    Returns the concatenation when more than one tensor remains, the single
+    remaining tensor as-is, or ``None`` when nothing remains.
+    """
+    xl = [x for x in xl if x is not None and x.shape[dim] > 0]
    if len(xl) > 1:
        return torch.cat(xl, dim)
-    else:
+    elif len(xl) == 1:
        return xl[0]
+    else:
+        return None

 def get_timestep_embedding(timesteps, embedding_dim):
    """
@@ -170,8 +170,14 @@ class Attention(nn.Module):
        joint_query = apply_rope1(joint_query, image_rotary_emb)
        joint_key = apply_rope1(joint_key, image_rotary_emb)

+        if encoder_hidden_states_mask is not None:
+            attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=hidden_states.dtype, device=hidden_states.device)
+            attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask
+        else:
+            attn_mask = None
+
        joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads,
-                                                         attention_mask, transformer_options=transformer_options,
+                                                         attn_mask, transformer_options=transformer_options,
                                                         skip_reshape=True)

        txt_attn_output = joint_hidden_states[:, :seq_txt, :]
@@ -430,6 +436,9 @@ class QwenImageTransformer2DModel(nn.Module):
        encoder_hidden_states = context
        encoder_hidden_states_mask = attention_mask

+        if encoder_hidden_states_mask is not None and not torch.is_floating_point(encoder_hidden_states_mask):
+            encoder_hidden_states_mask = (encoder_hidden_states_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max
+
        hidden_states, img_ids, orig_shape = self.process_img(x)
        num_embeds = hidden_states.shape[1]

@@ -62,6 +62,8 @@ class WanSelfAttention(nn.Module):
            x(Tensor): Shape [B, L, num_heads, C / num_heads]
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        """
+        patches = transformer_options.get("patches", {})
+
        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim

        def qkv_fn_q(x):
@@ -86,6 +88,10 @@ class WanSelfAttention(nn.Module):
            transformer_options=transformer_options,
        )

+        if "attn1_patch" in patches:
+            for p in patches["attn1_patch"]:
+                x = p({"x": x, "q": q, "k": k, "transformer_options": transformer_options})
+
        x = self.o(x)
        return x

@@ -225,6 +231,8 @@ class WanAttentionBlock(nn.Module):
        """
        # assert e.dtype == torch.float32

+        patches = transformer_options.get("patches", {})
+
        if e.ndim < 4:
            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
        else:
@@ -242,6 +250,11 @@ class WanAttentionBlock(nn.Module):

        # cross-attention & ffn
        x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options)
+
+        if "attn2_patch" in patches:
+            for p in patches["attn2_patch"]:
+                x = p({"x": x, "transformer_options": transformer_options})
+
        y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x)))
        x = torch.addcmul(x, y, repeat_e(e[5], x))
        return x
@@ -488,7 +501,7 @@ class WanModel(torch.nn.Module):
        self.blocks = nn.ModuleList([
            wan_attn_block_class(cross_attn_type, dim, ffn_dim, num_heads,
                                 window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
-            for _ in range(num_layers)
+            for i in range(num_layers)
        ])

        # head
@@ -541,6 +554,7 @@ class WanModel(torch.nn.Module):
        # embeddings
        x = self.patch_embedding(x.float()).to(x.dtype)
        grid_sizes = x.shape[2:]
+        transformer_options["grid_sizes"] = grid_sizes
        x = x.flatten(2).transpose(1, 2)

        # time embeddings
@@ -738,6 +752,7 @@ class VaceWanModel(WanModel):
        # embeddings
        x = self.patch_embedding(x.float()).to(x.dtype)
        grid_sizes = x.shape[2:]
+        transformer_options["grid_sizes"] = grid_sizes
        x = x.flatten(2).transpose(1, 2)

        # time embeddings
@@ -0,0 +1,500 @@
+import torch
+from einops import rearrange, repeat
+import comfy
+from comfy.ldm.modules.attention import optimized_attention
+
+
+def calculate_x_ref_attn_map(visual_q, ref_k, ref_target_masks, split_num=8):
+    """Compute, per reference mask, how strongly each query token attends to the masked reference tokens.
+
+    The query is transposed on dims 1/2 and the key is permuted with (0, 2, 3, 1) before
+    the matmul, so both are expected to be 4D with heads on dim 2. For every mask the
+    softmax attention over the reference tokens is multiplied by the mask, summed over
+    the reference axis, divided by the mask sum, and finally averaged over heads.
+    The query axis is processed in roughly `split_num` chunks to limit peak memory.
+
+    Returns the per-mask maps concatenated along dim 0.
+    """
+    scaled_q = visual_q.transpose(1, 2) * (1.0 / visual_q.shape[-1] ** 0.5)
+    batch, n_heads, q_len, _ = scaled_q.shape
+
+    # Number of query tokens handled per iteration (at least 1, at most q_len).
+    step = min(max(q_len // split_num, 1), q_len)
+
+    per_mask_maps = []
+    for mask in ref_target_masks:
+        mask = mask.view(1, 1, 1, -1)
+        head_map = torch.zeros(batch, n_heads, q_len, device=scaled_q.device, dtype=scaled_q.dtype)
+
+        for start in range(0, q_len, step):
+            stop = min(start + step, q_len)
+
+            # (B, H, chunk, ref_len) attention logits for this slice of queries
+            probs = torch.matmul(scaled_q[:, :, start:stop], ref_k.permute(0, 2, 3, 1))
+
+            # Numerically-stabilised softmax over the reference axis
+            probs = (probs - probs.max(dim=-1, keepdim=True).values).exp()
+            probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
+
+            # Mean attention mass that lands inside the mask
+            head_map[:, :, start:stop] = (probs * mask).sum(-1) / (mask.sum() + 1e-8)
+
+        # Average across heads -> (B, q_len)
+        per_mask_maps.append(head_map.mean(dim=1))
+
+    return torch.cat(per_mask_maps, dim=0)
+
+def get_attn_map_with_target(visual_q, ref_k, shape, ref_target_masks=None, split_num=2):
+    """Average reference-attention maps over all heads, processing heads in groups.
+
+    Args:
+        visual_q (torch.Tensor): queries laid out as B M H K.
+        ref_k (torch.Tensor): keys laid out as B M H K; only the first
+            N_h * N_w tokens are used as the reference.
+        shape (tuple): (N_t, N_h, N_w); only N_h and N_w are used here.
+        ref_target_masks (torch.Tensor): [class_num, N_h * N_w] masks. Required,
+            despite the keyword default.
+        split_num (int): number of head groups evaluated one after another.
+
+    Returns:
+        torch.Tensor: [class_num, M] attention map averaged over heads.
+
+    Raises:
+        ValueError: if `ref_target_masks` is None.
+    """
+    if ref_target_masks is None:
+        raise ValueError("ref_target_masks is required to compute the reference attention map")
+
+    _, N_h, N_w = shape
+
+    x_seqlens = N_h * N_w
+    ref_k = ref_k[:, :x_seqlens]
+    _, seq_lens, heads, _ = visual_q.shape
+    class_num, _ = ref_target_masks.shape
+    x_ref_attn_maps = torch.zeros(class_num, seq_lens).to(visual_q)
+
+    # Never ask for more groups than there are heads (an empty group averages to NaN).
+    split_num = max(1, min(split_num, heads))
+    split_chunk = heads // split_num
+    # Heads left over when `heads` is not a multiple of `split_num`; they are folded
+    # into the last group instead of being silently dropped.
+    remainder = heads % split_num
+
+    for i in range(split_num):
+        start = i * split_chunk
+        stop = heads if i == split_num - 1 else start + split_chunk
+        x_ref_attn_maps_perhead = calculate_x_ref_attn_map(
+            visual_q[:, :, start:stop, :],
+            ref_k[:, :, start:stop, :],
+            ref_target_masks
+            )
+        if remainder:
+            # Uneven groups: weight each group mean by its head count.
+            x_ref_attn_maps_perhead = x_ref_attn_maps_perhead * (stop - start)
+        x_ref_attn_maps += x_ref_attn_maps_perhead
+
+    # Even groups keep the original arithmetic (plain mean of the group means).
+    return x_ref_attn_maps / (heads if remainder else split_num)
+
+
+def normalize_and_scale(column, source_range, target_range, epsilon=1e-8):
+    """Linearly map `column` from `source_range` (min, max) onto `target_range` (min, max).
+
+    `epsilon` is added to the source span to avoid dividing by zero.
+    """
+    src_lo, src_hi = source_range
+    dst_lo, dst_hi = target_range
+    unit = (column - src_lo) / (src_hi - src_lo + epsilon)
+    return unit * (dst_hi - dst_lo) + dst_lo
+
+
+def rotate_half(x):
+    """Treat the last dim as interleaved pairs (a, b) and return it with each pair replaced by (-b, a)."""
+    pairs = rearrange(x, "... (d r) -> ... d r", r=2)
+    first, second = pairs.unbind(dim=-1)
+    rotated = torch.stack((-second, first), dim=-1)
+    return rearrange(rotated, "... d r -> ... (d r)")
+
+
+def get_audio_embeds(encoded_audio, audio_start, audio_end):
+    """Gather a 5-frame window of audio features around every frame in [audio_start, audio_end).
+
+    `encoded_audio` holds one tensor per speaker, indexed by frame on dim 0. Window
+    indices that fall outside the available frames are clamped to the valid range.
+    Returns the per-speaker results stacked along a new leading dim.
+    """
+    # The length of the first speaker's track decides whether padding is needed.
+    audio_frames = encoded_audio[0].shape[0]
+
+    # Offsets -2..2 around each centre frame
+    window_offsets = (torch.arange(4 + 1) - 2) * 1
+
+    embeds = []
+    for speaker_audio in encoded_audio:
+        if audio_end > audio_frames:
+            # Not enough audio for this window: pad with copies of the first frame,
+            # which is most likely silence.
+            pad_len = audio_end - audio_frames
+            filler = speaker_audio[:1].repeat(pad_len, *([1] * (speaker_audio.dim() - 1)))
+            speaker_audio = torch.cat([speaker_audio, filler], dim=0)
+
+        window_idx = torch.arange(audio_start, audio_end, 1).unsqueeze(1) + window_offsets.unsqueeze(0)
+        window_idx = torch.clamp(window_idx, min=0, max=speaker_audio.shape[0] - 1)
+        embeds.append(speaker_audio[window_idx].unsqueeze(0))
+
+    return torch.cat(embeds, dim=0)
+
+
+def project_audio_features(audio_proj, encoded_audio, audio_start, audio_end):
+    # Build audio context tokens for the frame window [audio_start, audio_end):
+    # gather windowed audio features per speaker, regroup them into a "first frame"
+    # part and a "latter frames" part, and feed both to `audio_proj`.
+    audio_embs = get_audio_embeds(encoded_audio, audio_start, audio_end)
+
+    # Frame 0 is kept on its own; the remaining frames are grouped 4 at a time
+    # (presumably matching a 4x temporal compression of the video latents - TODO confirm).
+    first_frame_audio_emb_s = audio_embs[:, :1, ...]
+    latter_frame_audio_emb = audio_embs[:, 1:, ...]
+    latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=4)
+
+    # Centre position of the per-frame audio window
+    middle_index = audio_proj.seq_len // 2
+
+    # First frame of each group: window positions up to and including the centre
+    latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...]
+    latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
+    # Last frame of each group: window positions from the centre onwards
+    latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...]
+    latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
+    # Frames in between: only the centre position
+    latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...]
+    latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
+    # Concatenate along the merged (frame, window) axis in first / middle / last order
+    latter_frame_audio_emb_s = torch.cat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2)
+
+    audio_emb = audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s)
+    # Split dim 0 into size-1 pieces (one per speaker, given get_audio_embeds stacks
+    # speakers on dim 0) and concatenate them along dim 2, leaving a leading dim of 1.
+    audio_emb = torch.cat(audio_emb.split(1), dim=2)
+
+    return audio_emb
+
+
+class RotaryPositionalEmbedding1D(torch.nn.Module):
+    """1D rotary position embedding driven by explicit (possibly fractional) position values."""
+
+    def __init__(self,
+                 head_dim,
+                 ):
+        super().__init__()
+        self.head_dim = head_dim
+        self.base = 10000
+
+    def precompute_freqs_cis_1d(self, pos_indices):
+        """Return the rotation angles for `pos_indices`; each frequency is repeated for both pair members."""
+        exponents = torch.arange(0, self.head_dim, 2)[: (self.head_dim // 2)].float() / self.head_dim
+        inv_freq = (1.0 / (self.base ** exponents)).to(pos_indices.device)
+        angles = torch.einsum("..., f -> ... f", pos_indices.float(), inv_freq)
+        return repeat(angles, "... n -> ... (n r)", r=2)
+
+    def forward(self, x, pos_indices):
+        """Rotate `x` by the angles of `pos_indices`; computed in float32 and cast back to x's dtype."""
+        angles = self.precompute_freqs_cis_1d(pos_indices)
+
+        x_f32 = x.float()
+
+        angles = angles.float().to(x.device)
+        cos = rearrange(angles.cos(), 'n d -> 1 1 n d')
+        sin = rearrange(angles.sin(), 'n d -> 1 1 n d')
+        rotated = (x_f32 * cos) + (rotate_half(x_f32) * sin)
+
+        return rotated.type_as(x)
+
+# Cross-attention from video tokens (queries) to per-frame encoder tokens (keys/values).
+# Attention is computed independently per frame: the token axis is folded into the batch
+# as (B * N_t, N_h * N_w, dim) before attending.
+class SingleStreamAttention(torch.nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        encoder_hidden_states_dim: int,
+        num_heads: int,
+        qkv_bias: bool,
+        device=None, dtype=None, operations=None
+    ) -> None:
+        super().__init__()
+        self.dim = dim
+        self.encoder_hidden_states_dim = encoder_hidden_states_dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+
+        # q comes from the video tokens, k/v from the encoder states (one fused projection).
+        self.q_linear = operations.Linear(dim, dim, bias=qkv_bias, device=device, dtype=dtype)
+        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
+        self.kv_linear = operations.Linear(encoder_hidden_states_dim, dim * 2, bias=qkv_bias, device=device, dtype=dtype)
+
+    # x: (B, tokens, dim); shape: (N_t, N_h, N_w) token grid.
+    # encoder_hidden_states is viewed below as (B * N_t, N, ...), i.e. one row per frame.
+    # Returns a tensor with the same token count as the input x.
+    def forward(self, x: torch.Tensor, encoder_hidden_states: torch.Tensor, shape=None) -> torch.Tensor:
+        N_t, N_h, N_w = shape
+
+        expected_tokens = N_t * N_h * N_w
+        actual_tokens = x.shape[1]
+        x_extra = None
+
+        # Token count does not match the grid: set aside the last frame's worth of tokens
+        # (N_h * N_w) and attend over one frame fewer. Zeros are appended for those tokens
+        # at the end, so they receive no audio contribution.
+        # NOTE(review): presumably this covers an extra reference/trailing frame - confirm with caller.
+        if actual_tokens != expected_tokens:
+            x_extra = x[:, -N_h * N_w:, :]
+            x = x[:, :-N_h * N_w, :]
+            N_t = N_t - 1
+
+        B = x.shape[0]
+        S = N_h * N_w
+        # Fold frames into the batch dimension
+        x = x.view(B * N_t, S, self.dim)
+
+        # get q for hidden_state
+        q = self.q_linear(x).view(B * N_t, S, self.num_heads, self.head_dim)
+
+        # get kv from encoder_hidden_states # shape: (B, N, num_heads, head_dim)
+        kv = self.kv_linear(encoder_hidden_states)
+        encoder_k, encoder_v = kv.view(B * N_t, encoder_hidden_states.shape[1], 2, self.num_heads, self.head_dim).unbind(2)
+
+        #print("q.shape", q.shape) #torch.Size([21, 1024, 40, 128])
+        # Heads are moved to dim 1 for the attention call and moved back afterwards.
+        x = optimized_attention(
+            q.transpose(1, 2),
+            encoder_k.transpose(1, 2),
+            encoder_v.transpose(1, 2),
+            heads=self.num_heads, skip_reshape=True, skip_output_reshape=True).transpose(1, 2)
+
+        # linear transform
+        x = self.proj(x.reshape(B * N_t, S, self.dim))
+        # Unfold frames back out of the batch dimension
+        x = x.view(B, N_t * S, self.dim)
+
+        # Restore the original token count with zeros for the tokens set aside above
+        if x_extra is not None:
+            x = torch.cat([x, torch.zeros_like(x_extra)], dim=1)
+
+        return x
+
+# Multi-speaker variant of SingleStreamAttention. With more than one row in
+# `x_ref_attn_map`, queries and keys are tagged with a 1D rotary "speaker position" so
+# video tokens preferentially attend to the audio tokens of the speaker they belong to.
+# With zero or one row it falls through to the parent implementation.
+class SingleStreamMultiAttention(SingleStreamAttention):
+    def __init__(
+        self,
+        dim: int,
+        encoder_hidden_states_dim: int,
+        num_heads: int,
+        qkv_bias: bool,
+        class_range: int = 24,
+        class_interval: int = 4,
+        device=None, dtype=None, operations=None
+    ) -> None:
+        super().__init__(
+            dim=dim,
+            encoder_hidden_states_dim=encoder_hidden_states_dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            device=device,
+            dtype=dtype,
+            operations=operations
+        )
+
+        # Rotary-embedding layout parameters
+        self.class_interval = class_interval
+        self.class_range = class_range
+        # NOTE(review): max_humans is not read anywhere in this class.
+        self.max_humans = self.class_range // self.class_interval
+
+        # Constant bucket used for background tokens
+        # NOTE(review): forward() recomputes this value locally instead of reading self.rope_bak.
+        self.rope_bak = int(self.class_range // 2)
+
+        self.rope_1d = RotaryPositionalEmbedding1D(self.head_dim)
+
+    # x: (B, tokens, dim) video tokens; shape: (N_t, N_h, N_w) token grid;
+    # x_ref_attn_map: one row per speaker with one value per video token (it is indexed
+    # with arange(size(1)) below), or None for the single-speaker path.
+    def forward(
+        self,
+        x: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        shape=None,
+        x_ref_attn_map=None
+    ) -> torch.Tensor:
+        encoder_hidden_states = encoder_hidden_states.squeeze(0).to(x.device)
+        human_num = x_ref_attn_map.shape[0] if x_ref_attn_map is not None else 1
+        # Single-speaker fall-through
+        if human_num <= 1:
+            return super().forward(x, encoder_hidden_states, shape)
+
+        N_t, N_h, N_w = shape
+
+        # Frame count of x disagrees with the number of audio rows: set aside the last
+        # frame's worth of tokens; zeros are appended for them at the end.
+        x_extra = None
+        if x.shape[0] * N_t != encoder_hidden_states.shape[0]:
+            x_extra = x[:, -N_h * N_w:, :]
+            x = x[:, :-N_h * N_w, :]
+            N_t = N_t - 1
+        x = rearrange(x, "B (N_t S) C -> (B N_t) S C", N_t=N_t)
+
+        # Query projection
+        # From here on B is the folded (batch * frames) size.
+        B, N, C = x.shape
+        q = self.q_linear(x)
+        q = q.view(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+
+        # Use `class_range` logic for 2 speakers
+        # Speaker 1 occupies [0, class_interval], speaker 2 the top class_interval of the
+        # range, and background sits in the middle.
+        rope_h1 = (0, self.class_interval)
+        rope_h2 = (self.class_range - self.class_interval, self.class_range)
+        rope_bak = int(self.class_range // 2)
+
+        # Normalize and scale attention maps for each speaker
+        max_values = x_ref_attn_map.max(1).values[:, None, None]
+        min_values = x_ref_attn_map.min(1).values[:, None, None]
+        max_min_values = torch.cat([max_values, min_values], dim=2)
+
+        # Only rows 0 and 1 of the map are used, i.e. exactly two speakers are supported here.
+        human1_max_value, human1_min_value = max_min_values[0, :, 0].max(), max_min_values[0, :, 1].min()
+        human2_max_value, human2_min_value = max_min_values[1, :, 0].max(), max_min_values[1, :, 1].min()
+
+        human1 = normalize_and_scale(x_ref_attn_map[0], (human1_min_value, human1_max_value), rope_h1)
+        human2 = normalize_and_scale(x_ref_attn_map[1], (human2_min_value, human2_max_value), rope_h2)
+        back = torch.full((x_ref_attn_map.size(1),), rope_bak, dtype=human1.dtype, device=human1.device)
+
+        # Token-wise speaker dominance
+        # NOTE(review): argmax runs over the rows of x_ref_attn_map, so with two rows it
+        # yields only 0 or 1 and the `back` column (index 2) looks unreachable - confirm.
+        max_indices = x_ref_attn_map.argmax(dim=0)
+        normalized_map = torch.stack([human1, human2, back], dim=1)
+        normalized_pos = normalized_map[torch.arange(x_ref_attn_map.size(1)), max_indices]
+
+        # Apply rotary to Q
+        # Frames are unfolded so the position vector lines up with the full token axis.
+        q = rearrange(q, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t)
+        q = self.rope_1d(q, normalized_pos)
+        q = rearrange(q, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t)
+
+        # Keys / Values
+        _, N_a, _ = encoder_hidden_states.shape
+        encoder_kv = self.kv_linear(encoder_hidden_states)
+        encoder_kv = encoder_kv.view(B, N_a, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        encoder_k, encoder_v = encoder_kv.unbind(0)
+
+        # Rotary for keys – assign centre of each speaker bucket to its context tokens
+        # First half of each frame's audio tokens is treated as speaker 1, second half as speaker 2.
+        per_frame = torch.zeros(N_a, dtype=encoder_k.dtype, device=encoder_k.device)
+        per_frame[: per_frame.size(0) // 2] = (rope_h1[0] + rope_h1[1]) / 2
+        per_frame[per_frame.size(0) // 2 :] = (rope_h2[0] + rope_h2[1]) / 2
+        encoder_pos = torch.cat([per_frame] * N_t, dim=0)
+
+        encoder_k = rearrange(encoder_k, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t)
+        encoder_k = self.rope_1d(encoder_k, encoder_pos)
+        encoder_k = rearrange(encoder_k, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t)
+
+        # Final attention
+        # These rearranges swap dims 1 and 2; the transposes in the call below swap them
+        # back, so attention receives heads on dim 1.
+        q = rearrange(q, "B H M K -> B M H K")
+        encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
+        encoder_v = rearrange(encoder_v, "B H M K -> B M H K")
+
+        x = optimized_attention(
+            q.transpose(1, 2),
+            encoder_k.transpose(1, 2),
+            encoder_v.transpose(1, 2),
+            heads=self.num_heads, skip_reshape=True, skip_output_reshape=True).transpose(1, 2)
+
+        # Linear projection
+        x = x.reshape(B, N, C)
+        x = self.proj(x)
+
+        # Restore original layout
+        x = rearrange(x, "(B N_t) S C -> B (N_t S) C", N_t=N_t)
+        if x_extra is not None:
+            x = torch.cat([x, torch.zeros_like(x_extra)], dim=1)
+
+        return x
+
+
+# Projects windowed audio features into `context_tokens` tokens of size `out_dim` per
+# frame. The first frame and the latter frames use separate input projections
+# (proj1 / proj1_vf) because their flattened window sizes differ (seq_len vs seq_len_vf).
+class MultiTalkAudioProjModel(torch.nn.Module):
+    def __init__(
+        self,
+        seq_len: int = 5,
+        seq_len_vf: int = 12,
+        blocks: int = 12,
+        channels: int = 768,
+        intermediate_dim: int = 512,
+        out_dim: int = 768,
+        context_tokens: int = 32,
+        device=None, dtype=None, operations=None
+    ):
+        super().__init__()
+
+        self.seq_len = seq_len
+        self.blocks = blocks
+        self.channels = channels
+        # Flattened input sizes: window * blocks * channels
+        self.input_dim = seq_len * blocks * channels
+        self.input_dim_vf = seq_len_vf * blocks * channels
+        self.intermediate_dim = intermediate_dim
+        self.context_tokens = context_tokens
+        self.out_dim = out_dim
+
+        # define multiple linear layers
+        self.proj1 = operations.Linear(self.input_dim, intermediate_dim, device=device, dtype=dtype)
+        self.proj1_vf = operations.Linear(self.input_dim_vf, intermediate_dim, device=device, dtype=dtype)
+        self.proj2 = operations.Linear(intermediate_dim, intermediate_dim, device=device, dtype=dtype)
+        self.proj3 = operations.Linear(intermediate_dim, context_tokens * out_dim, device=device, dtype=dtype)
+        self.norm = operations.LayerNorm(out_dim, device=device, dtype=dtype)
+
+    # audio_embeds: first-frame features, audio_embeds_vf: latter-frame features; both 5D
+    # as (batch, frames, window, blocks, channels) per the rearrange patterns below.
+    # Returns (batch, frames, context_tokens, out_dim).
+    def forward(self, audio_embeds, audio_embeds_vf):
+        video_length = audio_embeds.shape[1] + audio_embeds_vf.shape[1]
+        # Only B is used below; S and C are unpacked but not read again.
+        B, _, _, S, C = audio_embeds.shape
+
+        # process audio of first frame
+        audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
+        batch_size, window_size, blocks, channels = audio_embeds.shape
+        audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
+
+        # process audio of latter frame
+        audio_embeds_vf = rearrange(audio_embeds_vf, "bz f w b c -> (bz f) w b c")
+        batch_size_vf, window_size_vf, blocks_vf, channels_vf = audio_embeds_vf.shape
+        audio_embeds_vf = audio_embeds_vf.view(batch_size_vf, window_size_vf * blocks_vf * channels_vf)
+
+        # first projection
+        audio_embeds = torch.relu(self.proj1(audio_embeds))
+        audio_embeds_vf = torch.relu(self.proj1_vf(audio_embeds_vf))
+        audio_embeds = rearrange(audio_embeds, "(bz f) c -> bz f c", bz=B)
+        audio_embeds_vf = rearrange(audio_embeds_vf, "(bz f) c -> bz f c", bz=B)
+        # Join first-frame and latter-frame features along the frame axis
+        audio_embeds_c = torch.concat([audio_embeds, audio_embeds_vf], dim=1)
+        batch_size_c, N_t, C_a = audio_embeds_c.shape
+        audio_embeds_c = audio_embeds_c.view(batch_size_c*N_t, C_a)
+
+        # second projection
+        audio_embeds_c = torch.relu(self.proj2(audio_embeds_c))
+
+        # Expand each frame's feature vector into `context_tokens` tokens
+        context_tokens = self.proj3(audio_embeds_c).reshape(batch_size_c*N_t, self.context_tokens, self.out_dim)
+
+        # normalization and reshape
+        context_tokens = self.norm(context_tokens)
+        context_tokens = rearrange(context_tokens, "(bz f) m c -> bz f m c", f=video_length)
+
+        return context_tokens
+
+
+class WanMultiTalkAttentionBlock(torch.nn.Module):
+    """Holds the per-block audio cross-attention and the LayerNorm applied to its input."""
+
+    def __init__(self, in_dim=5120, out_dim=768, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # Video tokens (in_dim) attend to audio tokens (out_dim) with 40 heads.
+        self.audio_cross_attn = SingleStreamMultiAttention(
+            in_dim,
+            out_dim,
+            num_heads=40,
+            qkv_bias=True,
+            operations=operations,
+            **factory_kwargs,
+        )
+        self.norm_x = operations.LayerNorm(in_dim, elementwise_affine=True, **factory_kwargs)
+
+
+class MultiTalkGetAttnMapPatch:
+    """attn1 patch that records a reference attention map in transformer_options.
+
+    It never modifies the hidden states: `x` is always returned unchanged.
+    """
+
+    def __init__(self, ref_target_masks=None):
+        self.ref_target_masks = ref_target_masks
+
+    def __call__(self, kwargs):
+        x = kwargs["x"]
+        transformer_options = kwargs.get("transformer_options", {})
+
+        # Without masks there is nothing to compute.
+        if self.ref_target_masks is None:
+            return x
+
+        masks = self.ref_target_masks.to(x.device)
+        # Stored for a later patch to pick up from transformer_options.
+        transformer_options["x_ref_attn_map"] = get_attn_map_with_target(
+            kwargs["q"], kwargs["k"], transformer_options["grid_sizes"], ref_target_masks=masks
+        )
+        return x
+
+
+class MultiTalkCrossAttnPatch:
+    """attn2 patch that adds audio cross-attention output to the hidden states.
+
+    The patch is called with a dict holding "x" (hidden states) and
+    "transformer_options"; its return value replaces the hidden states in the
+    calling block.
+    """
+
+    def __init__(self, model_patch, audio_scale=1.0, ref_target_masks=None):
+        self.model_patch = model_patch
+        self.audio_scale = audio_scale
+        self.ref_target_masks = ref_target_masks
+
+    def __call__(self, kwargs):
+        transformer_options = kwargs.get("transformer_options", {})
+        block_idx = transformer_options.get("block_index", None)
+        x = kwargs["x"]
+        if block_idx is None:
+            # The caller assigns our return value back to the hidden states, so the
+            # only safe no-op is to hand them back untouched; returning zeros here
+            # would erase the residual stream.
+            return x
+
+        audio_embeds = transformer_options.get("audio_embeds")
+        # pop: the attention map is consumed once and must not leak into a later call.
+        x_ref_attn_map = transformer_options.pop("x_ref_attn_map", None)
+
+        block = self.model_patch.model.blocks[block_idx]
+        norm_x = block.norm_x(x)
+        x_audio = block.audio_cross_attn(
+            norm_x, audio_embeds.to(x.dtype),
+            shape=transformer_options["grid_sizes"],
+            x_ref_attn_map=x_ref_attn_map
+        )
+        # Residual add, scaled by the user-controlled audio strength.
+        x = x + x_audio * self.audio_scale
+        return x
+
+    def models(self):
+        """Return the model patch held by this patch object."""
+        return [self.model_patch]
+
+class MultiTalkApplyModelWrapper:
+    """Apply-model wrapper that overwrites the leading slice of `x` (along dim 2) with fixed latents."""
+
+    def __init__(self, init_latents):
+        self.init_latents = init_latents
+
+    def __call__(self, executor, x, *args, **kwargs):
+        # In-place write: the first init_latents.shape[2] entries on dim 2 are replaced
+        # (after matching x's dtype/device) before the wrapped call runs.
+        lead = self.init_latents.shape[2]
+        x[:, :, :lead] = self.init_latents.to(x)
+        return executor(x, *args, **kwargs)
+
+
+class InfiniteTalkOuterSampleWrapper:
+    """Outer-sample wrapper that pins the motion-frame latents during sampling.
+
+    When `motion_frames_latent` is set, an apply-model wrapper is registered that
+    re-inserts those latents into the model input on every call. When `is_extend`
+    is also set, the leading frames of the sampled result are replaced by the motion
+    frames before returning.
+    """
+
+    def __init__(self, motion_frames_latent, model_patch, is_extend=False):
+        self.motion_frames_latent = motion_frames_latent
+        self.model_patch = model_patch
+        self.is_extend = is_extend
+
+    def __call__(self, executor, *args, **kwargs):
+        model_patcher = executor.class_obj.model_patcher
+        model_options = executor.class_obj.model_options
+        process_latent_in = model_patcher.model.process_latent_in
+        has_motion_frames = self.motion_frames_latent is not None
+
+        # for InfiniteTalk, model input first latent(s) need to always be replaced on every step
+        if has_motion_frames:
+            wrappers = model_options["transformer_options"]["wrappers"]
+            w = wrappers.setdefault(comfy.patcher_extension.WrappersMP.APPLY_MODEL, {})
+            w["MultiTalk_apply_model"] = [MultiTalkApplyModelWrapper(process_latent_in(self.motion_frames_latent))]
+
+        # run the sampling process
+        result = executor(*args, **kwargs)
+
+        # insert motion frames before decoding
+        # Guarded on has_motion_frames: with no motion frames there is nothing to
+        # splice in, and reading .shape on None would raise.
+        if self.is_extend and has_motion_frames:
+            overlap = self.motion_frames_latent.shape[2]
+            result = torch.cat([self.motion_frames_latent.to(result), result[:, :, overlap:]], dim=2)
+
+        return result
+
+    def to(self, device_or_dtype):
+        """Move the stored latents when given a torch.device; any other argument is ignored."""
+        if isinstance(device_or_dtype, torch.device):
+            if self.motion_frames_latent is not None:
+                self.motion_frames_latent = self.motion_frames_latent.to(device_or_dtype)
+        return self
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from comfy.ldm.modules.diffusionmodules.model import vae_attention
+from comfy.ldm.modules.diffusionmodules.model import vae_attention, torch_cat_if_needed

 import comfy.ops
 ops = comfy.ops.disable_weight_init
@@ -20,22 +20,29 @@ class CausalConv3d(ops.Conv3d):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self._padding = (self.padding[2], self.padding[2], self.padding[1],
-                         self.padding[1], 2 * self.padding[0], 0)
-        self.padding = (0, 0, 0)
+        self._padding = 2 * self.padding[0]
+        self.padding = (0, self.padding[1], self.padding[2])

    def forward(self, x, cache_x=None, cache_list=None, cache_idx=None):
        if cache_list is not None:
            cache_x = cache_list[cache_idx]
            cache_list[cache_idx] = None

-        padding = list(self._padding)
-        if cache_x is not None and self._padding[4] > 0:
-            cache_x = cache_x.to(x.device)
-            x = torch.cat([cache_x, x], dim=2)
-            padding[4] -= cache_x.shape[2]
+        if cache_x is None and x.shape[2] == 1:
+            #Fast path - the op will pad for use by truncating the weight
+            #and save math on a pile of zeros.
+            return super().forward(x, autopad="causal_zero")
+
+        if self._padding > 0:
+            padding_needed = self._padding
+            if cache_x is not None:
+                cache_x = cache_x.to(x.device)
+                padding_needed = max(0, padding_needed - cache_x.shape[2])
+            padding_shape = list(x.shape)
+            padding_shape[2] = padding_needed
+            padding = torch.zeros(padding_shape, device=x.device, dtype=x.dtype)
+            x = torch_cat_if_needed([padding, cache_x, x], dim=2)
            del cache_x
-        x = F.pad(x, padding)

        return super().forward(x)

@@ -472,10 +479,12 @@ class WanVAE(nn.Module):

    def encode(self, x):
        conv_idx = [0]
-        feat_map = [None] * count_conv3d(self.decoder)
        ## cache
        t = x.shape[2]
        iter_ = 1 + (t - 1) // 4
+        feat_map = None
+        if iter_ > 1:
+            feat_map = [None] * count_conv3d(self.decoder)
        ## 对encode输入的x，按时间拆分为1、4、4、4....
        for i in range(iter_):
            conv_idx = [0]
@@ -495,10 +504,11 @@ class WanVAE(nn.Module):

    def decode(self, z):
        conv_idx = [0]
-        feat_map = [None] * count_conv3d(self.decoder)
        # z: [b,c,t,h,w]
-
        iter_ = z.shape[2]
+        feat_map = None
+        if iter_ > 1:
+            feat_map = [None] * count_conv3d(self.decoder)
        x = self.conv2(z)
        for i in range(iter_):
            conv_idx = [0]
@@ -260,6 +260,7 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format
                key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris
                key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer
+                key_map[k[:-len(".weight")]] = to #DiffSynth lora format
        for k in sdk:
            hidden_size = model.model_config.unet_config.get("hidden_size", 0)
            if k.endswith(".weight") and ".linear1." in k:
@@ -331,6 +332,12 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["{}".format(key_lora)] = k
                key_map["transformer.{}".format(key_lora)] = k

+    if isinstance(model, comfy.model_base.ACEStep15):
+        for k in sdk:
+            if k.startswith("diffusion_model.decoder.") and k.endswith(".weight"):
+                key_lora = k[len("diffusion_model.decoder."):-len(".weight")]
+                key_map["base_model.model.{}".format(key_lora)] = k  # Official base model loras
+
    return key_map


@@ -0,0 +1,81 @@
+import math
+import torch
+from typing import NamedTuple
+
+from comfy.quant_ops import QuantizedTensor
+
+class TensorGeometry(NamedTuple):
+    """Shape/dtype stand-in for a tensor, exposing the sizing methods used for layout math."""
+
+    # Dimensions of the described tensor (torch.Size is a tuple subclass).
+    shape: tuple
+    # Element dtype of the described tensor.
+    dtype: torch.dtype
+
+    def element_size(self):
+        """Return the size in bytes of one element of `dtype`.
+
+        torch.finfo/torch.iinfo reject some dtypes (e.g. bool and complex), so the
+        size is taken from the dtype itself, with a scalar-tensor fallback for torch
+        builds whose dtype objects lack `itemsize`.
+        """
+        itemsize = getattr(self.dtype, "itemsize", None)
+        if itemsize is None:
+            itemsize = torch.empty((), dtype=self.dtype).element_size()
+        return itemsize
+
+    def numel(self):
+        """Return the number of elements implied by `shape` (1 for an empty shape)."""
+        return math.prod(self.shape)
+
+def tensors_to_geometries(tensors, dtype=None):
+    """Map each tensor to a TensorGeometry; None and QuantizedTensor entries pass through unchanged.
+
+    The recorded dtype is, in increasing priority: the tensor's own dtype, its
+    `_model_dtype` attribute when present, and the explicit `dtype` argument.
+    """
+    def describe(t):
+        if t is None or isinstance(t, QuantizedTensor):
+            return t
+        resolved = getattr(t, "_model_dtype", t.dtype)
+        if dtype is not None:
+            resolved = dtype
+        return TensorGeometry(shape=t.shape, dtype=resolved)
+
+    return [describe(t) for t in tensors]
+
+def vram_aligned_size(tensor):
+    """Return the byte size of `tensor` rounded up to a multiple of 1024.
+
+    Lists are summed element by element, a QuantizedTensor is sized as the list of its
+    inner tensors, and None counts as 0 bytes.
+    """
+    if isinstance(tensor, list):
+        return sum(vram_aligned_size(item) for item in tensor)
+
+    if isinstance(tensor, QuantizedTensor):
+        inner_names, _ = tensor.__tensor_flatten__()
+        return vram_aligned_size([getattr(tensor, name) for name in inner_names])
+
+    if tensor is None:
+        return 0
+
+    alignment = 1024
+    nbytes = tensor.numel() * tensor.element_size()
+    # Round up to the next multiple of `alignment`
+    return (nbytes + alignment - 1) // alignment * alignment
+
+# Carve views out of the flat byte buffer `gathered`, one per entry of `tensors`, each with
+# the dtype and shape of the corresponding template. None entries yield None; a
+# QuantizedTensor template is rebuilt from views of each of its inner tensors.
+# Raises ValueError if `gathered` is not a 1D single-byte buffer or is too small.
+def interpret_gathered_like(tensors, gathered):
+    offset = 0
+    dest_views = []
+
+    if gathered.dim() != 1 or gathered.element_size() != 1:
+        raise ValueError(f"Buffer must be 1D and single-byte (got {gathered.dim()}D {gathered.dtype})")
+
+    for tensor in tensors:
+
+        if tensor is None:
+            dest_views.append(None)
+            continue
+
+        # Normalise to a {name: template} mapping so plain and quantized tensors share one loop.
+        if isinstance(tensor, QuantizedTensor):
+            inner_tensors, qt_ctx = tensor.__tensor_flatten__()
+            templates = { attr: getattr(tensor, attr) for attr in inner_tensors }
+        else:
+            templates = { "data": tensor }
+
+        actuals = {}
+        for attr, template in templates.items():
+            size = template.numel() * template.element_size()
+            if offset + size > gathered.numel():
+                raise ValueError(f"Buffer too small: needs {offset + size} bytes, but only has {gathered.numel()}. ")
+            # Reinterpret the byte slice as the template's dtype, then give it the template's shape.
+            actuals[attr] = gathered[offset:offset+size].view(dtype=template.dtype).view(template.shape)
+            # Advance by the aligned size (not the raw size), so each view starts on the
+            # same boundaries that vram_aligned_size accounts for.
+            offset += vram_aligned_size(template)
+
+        if isinstance(tensor, QuantizedTensor):
+            dest_views.append(QuantizedTensor.__tensor_unflatten__(actuals, qt_ctx, 0, 0))
+        else:
+            dest_views.append(actuals["data"])
+
+    return dest_views
+
+aimdo_allocator = None
@@ -49,6 +49,8 @@ import comfy.ldm.ace.model
 import comfy.ldm.omnigen.omnigen2
 import comfy.ldm.qwen_image.model
 import comfy.ldm.kandinsky5.model
+import comfy.ldm.anima.model
+import comfy.ldm.ace.ace_step15

 import comfy.model_management
 import comfy.patcher_extension
@@ -148,6 +150,8 @@ class BaseModel(torch.nn.Module):
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

+        comfy.model_management.archive_model_dtypes(self.diffusion_model)
+
        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0
@@ -298,7 +302,7 @@ class BaseModel(torch.nn.Module):

        return out

-    def load_model_weights(self, sd, unet_prefix=""):
+    def load_model_weights(self, sd, unet_prefix="", assign=False):
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
@@ -306,7 +310,7 @@ class BaseModel(torch.nn.Module):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

        to_load = self.model_config.process_unet_state_dict(to_load)
-        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
+        m, u = self.diffusion_model.load_state_dict(to_load, strict=False, assign=assign)
        if len(m) > 0:
            logging.warning("unet missing: {}".format(m))

@@ -321,7 +325,7 @@ class BaseModel(torch.nn.Module):
    def process_latent_out(self, latent):
        return self.latent_format.process_out(latent)

-    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
+    def state_dict_for_saving(self, unet_state_dict, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        extra_sds = []
        if clip_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_state_dict_for_saving(clip_state_dict))
@@ -329,10 +333,7 @@ class BaseModel(torch.nn.Module):
            extra_sds.append(self.model_config.process_vae_state_dict_for_saving(vae_state_dict))
        if clip_vision_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))
-
-        unet_state_dict = self.diffusion_model.state_dict()
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)
-
        if self.model_type == ModelType.V_PREDICTION:
            unet_state_dict["v_pred"] = torch.tensor([])

@@ -775,8 +776,8 @@ class StableAudio1(BaseModel):
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        return out

-    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
-        sd = super().state_dict_for_saving(clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
+    def state_dict_for_saving(self, unet_state_dict, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
+        sd = super().state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
        d = {"conditioner.conditioners.seconds_start.": self.seconds_start_embedder.state_dict(), "conditioner.conditioners.seconds_total.": self.seconds_total_embedder.state_dict()}
        for k in d:
            s = d[k]
@@ -1147,9 +1148,31 @@ class CosmosPredict2(BaseModel):
        sigma = (sigma / (sigma + 1))
        return latent_image / (1.0 - sigma)

+# BaseModel wrapper whose diffusion model is comfy.ldm.anima.model.Anima
+# (model type defaults to ModelType.FLOW).
+class Anima(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.anima.model.Anima)
+
+    # Extends the base conditioning with 'c_crossattn'. When `t5xxl_ids` is supplied, the
+    # text embeddings are first run through the diffusion model's preprocess_text_embeds,
+    # optionally weighted per token, and zero-padded on dim 1 up to 512.
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        t5xxl_ids = kwargs.get("t5xxl_ids", None)
+        t5xxl_weights = kwargs.get("t5xxl_weights", None)
+        device = kwargs["device"]
+        if cross_attn is not None:
+            if t5xxl_ids is not None:
+                cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.unsqueeze(0).to(device=device))
+                if t5xxl_weights is not None:
+                    # Per-token weights, broadcast over the batch and feature dims
+                    cross_attn *= t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn)
+
+                # Pad the token axis (dim 1) with zeros up to a length of 512
+                if cross_attn.shape[1] < 512:
+                    cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, 0, 512 - cross_attn.shape[1]))
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+        return out
+
 class Lumina2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiT)
+        self.memory_usage_factor_conds = ("ref_latents",)

    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
@@ -1169,6 +1192,35 @@ class Lumina2(BaseModel):
        if clip_text_pooled is not None:
            out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)

+        clip_vision_outputs = kwargs.get("clip_vision_outputs", list(map(lambda a: a.get("clip_vision_output"), kwargs.get("unclip_conditioning", [{}]))))  # Z Image omni
+        if clip_vision_outputs is not None and len(clip_vision_outputs) > 0:
+            sigfeats = []
+            for clip_vision_output in clip_vision_outputs:
+                if clip_vision_output is not None:
+                    image_size = clip_vision_output.image_sizes[0]
+                    shape = clip_vision_output.last_hidden_state.shape
+                    sigfeats.append(clip_vision_output.last_hidden_state.reshape(shape[0], image_size[1] // 16, image_size[2] // 16, shape[-1]))
+            if len(sigfeats) > 0:
+                out['siglip_feats'] = comfy.conds.CONDList(sigfeats)
+
+        ref_latents = kwargs.get("reference_latents", None)
+        if ref_latents is not None:
+            latents = []
+            for lat in ref_latents:
+                latents.append(self.process_latent_in(lat))
+            out['ref_latents'] = comfy.conds.CONDList(latents)
+
+        ref_contexts = kwargs.get("reference_latents_text_embeds", None)
+        if ref_contexts is not None:
+            out['ref_contexts'] = comfy.conds.CONDList(ref_contexts)
+
+        return out
+
+    def extra_conds_shapes(self, **kwargs):
+        """Report the shape used for memory estimation of the `ref_latents` cond.
+
+        Returns an empty dict when no `reference_latents` are passed. Otherwise
+        `ref_latents` is `[1, 16, N]`, where N sums the product of every
+        dimension from index 2 onwards over all reference latents.
+        """
+        out = {}
+        ref_latents = kwargs.get("reference_latents", None)
+        if ref_latents is not None:
+            flattened_total = sum(math.prod(lat.size()[2:]) for lat in ref_latents)
+            out['ref_latents'] = [1, 16, flattened_total]
         return out

 class WAN21(BaseModel):
@@ -1489,6 +1541,47 @@ class ACEStep(BaseModel):
        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
        return out

+class ACEStep15(BaseModel):
+    """Flow-type base model wrapping comfy.ldm.ace.ace_step15.AceStepConditionGenerationModel."""
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.ace_step15.AceStepConditionGenerationModel)
+
+    def extra_conds(self, **kwargs):
+        """Extend the parent conds with the ACE Step 1.5 conditioning inputs.
+
+        Adds `c_crossattn`, `lyric_embed` and `audio_codes` when the matching
+        kwarg is present, and always adds `refer_audio`: the last entry of
+        `reference_audio_timbre_latents`, or a built-in default latent when
+        none is given. `kwargs["device"]` is required.
+        """
+        out = super().extra_conds(**kwargs)
+        device = kwargs["device"]
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
+        #Guard on the lyrics themselves; checking cross_attn here wrapped None when lyrics were absent
+        if conditioning_lyrics is not None:
+            out['lyric_embed'] = comfy.conds.CONDRegular(conditioning_lyrics)
+
+        refer_audio = kwargs.get("reference_audio_timbre_latents", None)
+        if refer_audio is None or len(refer_audio) == 0:
+            #Default latent: 64 values moved to dim 1 and repeated 750 times along the last dim -> (1, 64, 750)
+            refer_audio = torch.tensor([[[-1.3672e-01, -1.5820e-01,  5.8594e-01, -5.7422e-01,  3.0273e-02,
+                                        2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
+                                        -2.7710e-02, -1.8066e-01, -2.9688e-01,  1.6016e+00, -2.6719e+00,
+                                        7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
+                                        2.4316e-01,  4.7266e-01,  4.6387e-02, -6.6406e-01, -2.1973e-01,
+                                        -6.7578e-01, -1.5723e-01,  9.5312e-01, -2.0020e-01, -1.7109e+00,
+                                        5.8984e-01, -5.7422e-01,  5.1562e-01,  2.8320e-01,  1.4551e-01,
+                                        -1.8750e-01, -5.9814e-02,  3.6719e-01, -1.0059e-01, -1.5723e-01,
+                                        2.0605e-01, -4.3359e-01, -8.2812e-01,  4.5654e-02, -6.6016e-01,
+                                        1.4844e-01,  9.4727e-02,  3.8477e-01, -1.2578e+00, -3.3203e-01,
+                                        -8.5547e-01,  4.3359e-01,  4.2383e-01, -8.9453e-01, -5.0391e-01,
+                                        -5.6152e-02, -2.9219e+00, -2.4658e-02,  5.0391e-01,  9.8438e-01,
+                                        7.2754e-02, -2.1582e-01,  6.3672e-01,  1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, 750)
+        else:
+            #Only the most recently supplied reference latent is used
+            refer_audio = refer_audio[-1]
+        out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
+
+        audio_codes = kwargs.get("audio_codes", None)
+        if audio_codes is not None:
+            out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
+
+        return out
+
 class Omnigen2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel)
@@ -1526,6 +1619,9 @@ class QwenImage(BaseModel):

    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
@@ -253,7 +253,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
                dit_config["image_model"] = "chroma_radiance"
                dit_config["in_channels"] = 3
                dit_config["out_channels"] = 3
-                dit_config["patch_size"] = 16
+                dit_config["patch_size"] = state_dict.get('{}img_in_patch.weight'.format(key_prefix)).size(dim=-1)
                dit_config["nerf_hidden_size"] = 64
                dit_config["nerf_mlp_ratio"] = 4
                dit_config["nerf_depth"] = 4
@@ -444,8 +444,15 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["ffn_dim_multiplier"] = (8.0 / 3.0)
            dit_config["z_image_modulation"] = True
            dit_config["time_scale"] = 1000.0
+            try:
+                dit_config["allow_fp16"] = torch.std(state_dict['{}layers.{}.ffn_norm1.weight'.format(key_prefix, dit_config["n_layers"] - 2)], unbiased=False).item() < 0.42
+            except Exception:
+                pass
            if '{}cap_pad_token'.format(key_prefix) in state_dict_keys:
                dit_config["pad_tokens_multiple"] = 32
+            sig_weight = state_dict.get('{}siglip_embedder.0.weight'.format(key_prefix), None)
+            if sig_weight is not None:
+                dit_config["siglip_feat_dim"] = sig_weight.shape[0]

        return dit_config

@@ -547,6 +554,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
    if '{}blocks.0.mlp.layer1.weight'.format(key_prefix) in state_dict_keys:  # Cosmos predict2
        dit_config = {}
        dit_config["image_model"] = "cosmos_predict2"
+        if "{}llm_adapter.blocks.0.cross_attn.q_proj.weight".format(key_prefix) in state_dict_keys:
+            dit_config["image_model"] = "anima"
        dit_config["max_img_h"] = 240
        dit_config["max_img_w"] = 240
        dit_config["max_frames"] = 128
@@ -646,6 +655,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.')
        return dit_config

+    if '{}encoder.lyric_encoder.layers.0.input_layernorm.weight'.format(key_prefix) in state_dict_keys:
+        dit_config = {}
+        dit_config["audio_model"] = "ace1.5"
+        return dit_config
+
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None

@@ -19,13 +19,21 @@
 import psutil
 import logging
 from enum import Enum
-from comfy.cli_args import args, PerformanceFeature
+from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
+import threading
 import torch
 import sys
 import platform
 import weakref
 import gc
 import os
+from contextlib import nullcontext
+import comfy.memory_management
+import comfy.utils
+import comfy.quant_ops
+
+import comfy_aimdo.torch
+import comfy_aimdo.model_vbar

 class VRAMState(Enum):
    DISABLED = 0    #No vram present: no need to move models to vram
@@ -578,9 +586,15 @@ WINDOWS = any(platform.win32_ver())

 EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
 if WINDOWS:
+    import comfy.windows
    EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
    if total_vram > (15 * 1024):  # more extra reserved vram on 16GB+ cards
        EXTRA_RESERVED_VRAM += 100 * 1024 * 1024
+    def get_free_ram():
+        return comfy.windows.get_free_ram()
+else:
+    def get_free_ram():
+        return psutil.virtual_memory().available

 if args.reserve_vram is not None:
    EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
@@ -592,7 +606,7 @@ def extra_reserved_memory():
 def minimum_inference_memory():
    return (1024 * 1024 * 1024) * 0.8 + extra_reserved_memory()

-def free_memory(memory_required, device, keep_loaded=[]):
+def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_required=0):
    cleanup_models_gc()
    unloaded_model = []
    can_unload = []
@@ -607,15 +621,23 @@ def free_memory(memory_required, device, keep_loaded=[]):

    for x in sorted(can_unload):
        i = x[-1]
-        memory_to_free = None
+        memory_to_free = 1e32
+        ram_to_free = 1e32
        if not DISABLE_SMART_MEMORY:
-            free_mem = get_free_memory(device)
-            if free_mem > memory_required:
-                break
-            memory_to_free = memory_required - free_mem
-        logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
-        if current_loaded_models[i].model_unload(memory_to_free):
+            memory_to_free = memory_required - get_free_memory(device)
+            ram_to_free = ram_required - get_free_ram()
+
+        if current_loaded_models[i].model.is_dynamic() and for_dynamic:
+            #don't actually unload dynamic models for the sake of other dynamic models
+            #as that works on-demand.
+            memory_required -= current_loaded_models[i].model.loaded_size()
+            memory_to_free = 0
+        if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free):
+            logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
            unloaded_model.append(i)
+        if ram_to_free > 0:
+            logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}")
+            current_loaded_models[i].model.partially_unload_ram(ram_to_free)

    for i in sorted(unloaded_model, reverse=True):
        unloaded_models.append(current_loaded_models.pop(i))
@@ -629,7 +651,7 @@ def free_memory(memory_required, device, keep_loaded=[]):
                soft_empty_cache()
    return unloaded_models

-def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
    cleanup_models_gc()
    global vram_state

@@ -650,7 +672,10 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu

    models_to_load = []

+    free_for_dynamic=True
    for x in models:
+        if not x.is_dynamic():
+            free_for_dynamic = False
        loaded_model = LoadedModel(x)
        try:
            loaded_model_index = current_loaded_models.index(loaded_model)
@@ -676,19 +701,25 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
            model_to_unload.model.detach(unpatch_all=False)
            model_to_unload.model_finalizer.detach()

+
    total_memory_required = {}
+    total_ram_required = {}
    for loaded_model in models_to_load:
        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
+        #x2, one to make sure the OS can fit the model for loading in disk cache, and for us to do any pinning we
+        #want to do.
+        #FIXME: This should subtract off the to_load current pin consumption.
+        total_ram_required[loaded_model.device] = total_ram_required.get(loaded_model.device, 0) + loaded_model.model_memory() * 2

    for device in total_memory_required:
        if device != torch.device("cpu"):
-            free_memory(total_memory_required[device] * 1.1 + extra_mem, device)
+            free_memory(total_memory_required[device] * 1.1 + extra_mem, device, for_dynamic=free_for_dynamic, ram_required=total_ram_required[device])

    for device in total_memory_required:
        if device != torch.device("cpu"):
            free_mem = get_free_memory(device)
            if free_mem < minimum_memory_required:
-                models_l = free_memory(minimum_memory_required, device)
+                models_l = free_memory(minimum_memory_required, device, for_dynamic=free_for_dynamic)
                logging.info("{} models unloaded.".format(len(models_l)))

    for loaded_model in models_to_load:
@@ -716,6 +747,26 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
        current_loaded_models.insert(0, loaded_model)
    return

+def load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load):
+    """Thread target used by load_models_gpu() when dynamic VRAM is enabled.
+
+    Runs load_models_gpu_orig() with the given arguments under
+    torch.inference_mode() and then calls soft_empty_cache().
+    """
+    with torch.inference_mode():
+        load_models_gpu_orig(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
+        soft_empty_cache()
+
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+    """Load `models` onto their devices, freeing memory first as needed.
+
+    Without dynamic VRAM this is a direct call to load_models_gpu_orig().
+    With dynamic VRAM the load runs on a short-lived helper thread, and any
+    exception raised there is re-raised in the calling thread so that a failed
+    load is never mistaken for a successful one.
+    """
+    if not enables_dynamic_vram():
+        load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights,
+                             minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
+        return
+
+    #Deliberately load models outside of the Aimdo mempool so they can be retained across
+    #nodes. Use a dummy thread to do it as pytorch documents that mempool contexts are
+    #thread local. So exploit that to escape context
+    failures = []
+
+    def _load_and_capture():
+        #An uncaught exception in a Thread is only reported, never propagated to join()
+        try:
+            load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
+        except Exception as e:
+            failures.append(e)
+
+    t = threading.Thread(target=_load_and_capture)
+    t.start()
+    t.join()
+    if failures:
+        raise failures[0]
+
 def load_model_gpu(model):
    return load_models_gpu([model])

@@ -732,6 +783,9 @@ def loaded_models(only_currently_used=False):

 def cleanup_models_gc():
    do_gc = False
+
+    reset_cast_buffers()
+
    for i in range(len(current_loaded_models)):
        cur = current_loaded_models[i]
        if cur.is_dead():
@@ -749,6 +803,11 @@ def cleanup_models_gc():
                logging.warning("WARNING, memory leak with model {}. Please make sure it is not being referenced from somewhere.".format(cur.real_model().__class__.__name__))


+def archive_model_dtypes(model):
+    """Record each parameter's current dtype on the module that directly owns it.
+
+    For a parameter named `p`, the dtype is stored as the module attribute
+    `p_comfy_model_dtype`.
+    """
+    for _, submodule in model.named_modules():
+        #recurse=False: each parameter is visited once, on its owning module
+        own_params = submodule.named_parameters(recurse=False)
+        for pname, tensor in own_params:
+            setattr(submodule, f"{pname}_comfy_model_dtype", tensor.dtype)
+

 def cleanup_models():
    to_delete = []
@@ -792,7 +851,7 @@ def unet_inital_load_device(parameters, dtype):

    mem_dev = get_free_memory(torch_dev)
    mem_cpu = get_free_memory(cpu_dev)
-    if mem_dev > mem_cpu and model_size < mem_dev:
+    if mem_dev > mem_cpu and model_size < mem_dev and comfy.memory_management.aimdo_allocator is None:
        return torch_dev
    else:
        return cpu_dev
@@ -1051,6 +1110,51 @@ def current_stream(device):
        return None

 stream_counters = {}
+
+STREAM_CAST_BUFFERS = {}
+LARGEST_CASTED_WEIGHT = (None, 0)
+
+def get_cast_buffer(offload_stream, device, size, ref):
+    """Return a reusable int8 buffer of at least `size` elements for `offload_stream`.
+
+    Buffers are cached per stream in STREAM_CAST_BUFFERS and reused while large
+    enough; otherwise a new one of exactly `size` elements is allocated on
+    `device` and replaces the cached one.
+
+    `ref` identifies the weight being cast. Returns None when a (re)allocation
+    would be needed and `ref` is the object already recorded as the largest
+    casted weight.
+    """
+    global LARGEST_CASTED_WEIGHT
+
+    #Allocate inside the stream's context when one is given, otherwise in a no-op context
+    if offload_stream is not None:
+        wf_context = offload_stream
+        if hasattr(wf_context, "as_context"):
+            wf_context = wf_context.as_context(offload_stream)
+    else:
+        wf_context = nullcontext()
+
+    cast_buffer = STREAM_CAST_BUFFERS.get(offload_stream, None)
+    if cast_buffer is None or cast_buffer.numel() < size:
+        if ref is LARGEST_CASTED_WEIGHT[0]:
+            #If there is one giant weight we do not want both streams to
+            #allocate a buffer for it. It's up to the caster to get the other
+            #offload stream in this corner case
+            return None
+        if cast_buffer is not None and cast_buffer.numel() > 50 * (1024 ** 2):
+            #I want my wrongly sized 50MB+ of VRAM back from the caching allocator right now
+            synchronize()
+            del STREAM_CAST_BUFFERS[offload_stream]
+            del cast_buffer
+            #FIXME: This doesn't work in Aimdo because mempool cant clear cache
+            soft_empty_cache()
+        with wf_context:
+            cast_buffer = torch.empty((size), dtype=torch.int8, device=device)
+            STREAM_CAST_BUFFERS[offload_stream] = cast_buffer
+
+        #Track the biggest weight a buffer has been allocated for
+        if  size > LARGEST_CASTED_WEIGHT[1]:
+            LARGEST_CASTED_WEIGHT = (ref, size)
+
+    return cast_buffer
+
+def reset_cast_buffers():
+    """Drop every cached cast buffer and forget the largest casted weight.
+
+    Each offload stream that owns a buffer is synchronized before the cache is
+    cleared, then soft_empty_cache() is called.
+    """
+    global LARGEST_CASTED_WEIGHT
+    LARGEST_CASTED_WEIGHT = (None, 0)
+    for offload_stream in STREAM_CAST_BUFFERS:
+        #get_cast_buffer() stores the no-stream buffer under the key None, which has no synchronize()
+        if offload_stream is not None:
+            offload_stream.synchronize()
+    STREAM_CAST_BUFFERS.clear()
+    soft_empty_cache()
+
 def get_offload_stream(device):
    stream_counter = stream_counters.get(device, 0)
    if NUM_STREAMS == 0:
@@ -1093,7 +1197,62 @@ def sync_stream(device, stream):
        return
    current_stream(device).wait_stream(stream)

-def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
+
+def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
+    """Copy every non-None entry of `tensors` into its matching view of `r`.
+
+    The destination views come from
+    comfy.memory_management.interpret_gathered_like(tensors, r). When `stream`
+    is given the copies are issued inside that stream's context.
+    """
+    if stream is None:
+        copy_context = nullcontext()
+    elif hasattr(stream, "as_context"):
+        copy_context = stream.as_context(stream)
+    else:
+        copy_context = stream
+
+    dest_views = comfy.memory_management.interpret_gathered_like(tensors, r)
+    with copy_context:
+        for src in tensors:
+            #One view is consumed per entry, None placeholders included, so later views stay aligned
+            target = dest_views.pop(0)
+            if src is not None:
+                target.copy_(src, non_blocking=non_blocking)
+
+
+def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None):
+    if hasattr(weight, "_v"):
+        #Unexpected usage patterns. There is no reason these don't work but they
+        #have no testing and no callers do this.
+        assert r is None
+        assert stream is None
+
+        cast_geometry = comfy.memory_management.tensors_to_geometries([ weight ])
+
+        if dtype is None:
+            dtype = weight._model_dtype
+
+        r = torch.empty_like(weight, dtype=dtype, device=device)
+
+        signature = comfy_aimdo.model_vbar.vbar_fault(weight._v)
+        if signature is not None:
+            raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device)
+            v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0]
+            if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature):
+                weight._v_signature = signature
+                #Send it over
+                v_tensor.copy_(weight, non_blocking=non_blocking)
+            #always take a deep copy even if _v is good, as we have no reasonable point to unpin
+            #a non comfy weight
+            r.copy_(v_tensor)
+            comfy_aimdo.model_vbar.vbar_unpin(weight._v)
+            return r
+
+        if weight.dtype != r.dtype and weight.dtype != weight._model_dtype:
+            #Offloaded casting could skip this, however it would make the quantizations
+            #inconsistent between loaded and offloaded weights. So force the double casting
+            #that would happen in regular flow to make offload deterministic.
+            cast_buffer = torch.empty_like(weight, dtype=weight._model_dtype, device=device)
+            cast_buffer.copy_(weight, non_blocking=non_blocking)
+            weight = cast_buffer
+        r.copy_(weight, non_blocking=non_blocking)
+
+        return r
+
    if device is None or weight.device == device:
        if not copy:
            if dtype is None or weight.dtype == dtype:
@@ -1112,10 +1271,12 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
        if hasattr(wf_context, "as_context"):
            wf_context = wf_context.as_context(stream)
        with wf_context:
-            r = torch.empty_like(weight, dtype=dtype, device=device)
+            if r is None:
+                r = torch.empty_like(weight, dtype=dtype, device=device)
            r.copy_(weight, non_blocking=non_blocking)
    else:
-        r = torch.empty_like(weight, dtype=dtype, device=device)
+        if r is None:
+            r = torch.empty_like(weight, dtype=dtype, device=device)
        r.copy_(weight, non_blocking=non_blocking)
    return r

@@ -1135,14 +1296,14 @@ if not args.disable_pinned_memory:
            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
        logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))

-PINNING_ALLOWED_TYPES = set(["Parameter", "QuantizedTensor"])
+PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])

 def discard_cuda_async_error():
    try:
        a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
        b = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
        _ = a + b
-        torch.cuda.synchronize()
+        synchronize()
    except torch.AcceleratorError:
        #Dump it! We already know about it from the synchronous return
        pass
@@ -1546,6 +1707,12 @@ def lora_compute_dtype(device):
    LORA_COMPUTE_DTYPES[device] = dtype
    return dtype

+def synchronize():
+    """Synchronize the active accelerator backend.
+
+    Intel XPU is checked first; otherwise CUDA is synchronized when available.
+    Nothing happens on any other backend.
+    """
+    if is_intel_xpu():
+        torch.xpu.synchronize()
+        return
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
 def soft_empty_cache(force=False):
    global cpu_state
    if cpu_state == CPUState.MPS:
@@ -1557,6 +1724,7 @@ def soft_empty_cache(force=False):
    elif is_mlu():
        torch.mlu.empty_cache()
    elif torch.cuda.is_available():
+        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

@@ -1568,9 +1736,6 @@ def debug_memory_summary():
        return torch.cuda.memory.memory_summary()
    return ""

-#TODO: might be cleaner to put this somewhere else
-import threading
-
 class InterruptProcessingException(Exception):
    pass

@@ -38,19 +38,7 @@ from comfy.comfy_types import UnetWrapperFunction
 from comfy.quant_ops import QuantizedTensor
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP

-
-def string_to_seed(data):
-    crc = 0xFFFFFFFF
-    for byte in data:
-        if isinstance(byte, str):
-            byte = ord(byte)
-        crc ^= byte
-        for _ in range(8):
-            if crc & 1:
-                crc = (crc >> 1) ^ 0xEDB88320
-            else:
-                crc >>= 1
-    return crc ^ 0xFFFFFFFF
+import comfy_aimdo.model_vbar

 def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
    to = model_options["transformer_options"].copy()
@@ -123,6 +111,10 @@ def move_weight_functions(m, device):
                memory += f.move_to(device=device)
    return memory

+def string_to_seed(data):
+    """Deprecated alias kept for callers that still import it from this module.
+
+    Logs a warning on every call and returns comfy.utils.string_to_seed(data).
+    """
+    logging.warning("WARNING: string_to_seed has moved from comfy.model_patcher to comfy.utils")
+    return comfy.utils.string_to_seed(data)
+
 class LowVramPatch:
    def __init__(self, key, patches, convert_func=None, set_func=None):
        self.key = key
@@ -169,6 +161,11 @@ def get_key_weight(model, key):

    return weight, set_func, convert_func

+def key_param_name_to_key(key, param):
+    """Join a module path and a parameter name into a state-dict style key.
+
+    An empty `key` yields `param` on its own, with no leading dot.
+    """
+    return f"{key}.{param}" if len(key) > 0 else param
+
 class AutoPatcherEjector:
    def __init__(self, model: 'ModelPatcher', skip_and_inject_on_exit_only=False):
        self.model = model
@@ -212,6 +209,27 @@ class MemoryCounter:
    def decrement(self, used: int):
        self.value -= used

+CustomTorchDevice = collections.namedtuple("FakeDevice", ["type", "index"])("comfy-lazy-caster", 0)
+
+class LazyCastingParam(torch.nn.Parameter):
+    """Parameter stand-in that produces its patched weight only when `.to()` is called.
+
+    Wraps `tensor` as a Parameter and remembers the owning patcher (`model`)
+    and the weight's `key` so the patched value can be computed on demand.
+    """
+    def __new__(cls, model, key, tensor):
+        #Only the tensor goes into Parameter construction; model/key are stored in __init__
+        return super().__new__(cls, tensor)
+
+    def __init__(self, model, key, tensor):
+        self.model = model
+        self.key = key
+
+    @property
+    def device(self):
+        #Always reports the placeholder device rather than the wrapped tensor's device
+        return CustomTorchDevice
+
+    #safetensors will .to() us to the cpu which we catch here to cast on demand. The returned tensor is
+    #then just a short lived thing in the safetensors serialization logic inside its big for loop over
+    #all weights getting garbage collected per-weight
+    def to(self, *args, **kwargs):
+        #The arguments are ignored: the weight is patched on the load device and returned on the cpu
+        return self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True).to("cpu")
+
+
 class ModelPatcher:
    def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
        self.size = size
@@ -269,6 +287,9 @@ class ModelPatcher:
        if not hasattr(self.model, 'model_offload_buffer_memory'):
            self.model.model_offload_buffer_memory = 0

+    def is_dynamic(self):
+        """Return False: this patcher is not a dynamic one."""
+        return False
+
    def model_size(self):
        if self.size > 0:
            return self.size
@@ -284,6 +305,9 @@ class ModelPatcher:
    def lowvram_patch_counter(self):
        return self.model.lowvram_patch_counter

+    def get_free_memory(self, device):
+        """Return comfy.model_management.get_free_memory(device)."""
+        return comfy.model_management.get_free_memory(device)
+
    def clone(self):
        n = self.__class__(self.model, self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update)
        n.patches = {}
@@ -611,14 +635,14 @@ class ModelPatcher:
                        sd.pop(k)
            return sd

-    def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
-        if key not in self.patches:
-            return
-
+    def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False):
        weight, set_func, convert_func = get_key_weight(self.model, key)
+        if key not in self.patches:
+            return weight
+
        inplace_update = self.weight_inplace_update or inplace_update

-        if key not in self.backup:
+        if key not in self.backup and not return_weight:
            self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)

        temp_dtype = comfy.model_management.lora_compute_dtype(device_to)
@@ -631,13 +655,15 @@ class ModelPatcher:

        out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
        if set_func is None:
-            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
-            if inplace_update:
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
+            if return_weight:
+                return out_weight
+            elif inplace_update:
                comfy.utils.copy_to_param(self.model, key, out_weight)
            else:
                comfy.utils.set_attr_param(self.model, key, out_weight)
        else:
-            set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))
+            return set_func(out_weight, inplace_update=inplace_update, seed=comfy.utils.string_to_seed(key), return_weight=return_weight)

    def pin_weight_to_device(self, key):
        weight, set_func, convert_func = get_key_weight(self.model, key)
@@ -654,7 +680,7 @@ class ModelPatcher:
        for key in list(self.pinned):
            self.unpin_weight(key)

-    def _load_list(self):
+    def _load_list(self, prio_comfy_cast_weights=False):
        loading = []
        for n, m in self.model.named_modules():
            params = []
@@ -681,7 +707,8 @@ class ModelPatcher:
                        return 0
                    module_offload_mem += check_module_offload_mem("{}.weight".format(n))
                    module_offload_mem += check_module_offload_mem("{}.bias".format(n))
-                loading.append((module_offload_mem, module_mem, n, m, params))
+                prepend = (not hasattr(m, "comfy_cast_weights"),) if prio_comfy_cast_weights else ()
+                loading.append(prepend + (module_offload_mem, module_mem, n, m, params))
        return loading

    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
@@ -773,7 +800,7 @@ class ModelPatcher:
                        continue

                for param in params:
-                    key = "{}.{}".format(n, param)
+                    key = key_param_name_to_key(n, param)
                    self.unpin_weight(key)
                    self.patch_weight_to_device(key, device_to=device_to)
                if comfy.model_management.is_device_cuda(device_to):
@@ -789,7 +816,7 @@ class ModelPatcher:
                n = x[1]
                params = x[3]
                for param in params:
-                    self.pin_weight_to_device("{}.{}".format(n, param))
+                    self.pin_weight_to_device(key_param_name_to_key(n, param))

            usable_stat = "{:.2f} MB usable,".format(lowvram_model_memory / (1024 * 1024)) if lowvram_model_memory < 1e32 else ""
            if lowvram_counter > 0:
@@ -895,7 +922,7 @@ class ModelPatcher:
                if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
                    move_weight = True
                    for param in params:
-                        key = "{}.{}".format(n, param)
+                        key = key_param_name_to_key(n, param)
                        bk = self.backup.get(key, None)
                        if bk is not None:
                            if not lowvram_possible:
@@ -946,7 +973,7 @@ class ModelPatcher:
                        logging.debug("freed {}".format(n))

                        for param in params:
-                            self.pin_weight_to_device("{}.{}".format(n, param))
+                            self.pin_weight_to_device(key_param_name_to_key(n, param))


            self.model.model_lowvram = True
@@ -984,6 +1011,9 @@ class ModelPatcher:

            return self.model.model_loaded_weight_memory - current_used

+    def partially_unload_ram(self, ram_to_unload):
+        """Do nothing: this patcher has no RAM to release on request."""
+        pass
+
    def detach(self, unpatch_all=True):
        self.eject_model()
        self.model_patches_to(self.offload_device)
@@ -1317,10 +1347,10 @@ class ModelPatcher:
                                                 key, original_weights=original_weights)
        del original_weights[key]
        if set_func is None:
-            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
            comfy.utils.copy_to_param(self.model, key, out_weight)
        else:
-            set_func(out_weight, inplace_update=True, seed=string_to_seed(key))
+            set_func(out_weight, inplace_update=True, seed=comfy.utils.string_to_seed(key))
        if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
            # TODO: disable caching if not enough system RAM to do so
            target_device = self.offload_device
@@ -1355,7 +1385,249 @@ class ModelPatcher:
        self.unpatch_hooks()
        self.clear_cached_hook_weights()

+    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
+        """Build the state dict used when saving this model.
+
+        Starts from the diffusion model's own state dict. Every "weight"/"bias" entry that
+        belongs to an op exposing comfy_cast_weights, and whose weights are not currently
+        patched in place, is replaced by a LazyCastingParam wrapper around the matching
+        model attribute. (NOTE(review): presumably the wrapper applies patches lazily when
+        the tensor is read - confirm against LazyCastingParam.)
+
+        The optional clip / vae / clip vision state dicts are handed through to the
+        wrapped model's state_dict_for_saving so they are not silently dropped.
+        """
+        unet_state_dict = self.model.diffusion_model.state_dict()
+        #Snapshot the keys: entries are replaced while we walk the dict.
+        for k in list(unet_state_dict.keys()):
+            op_keys = k.rsplit('.', 1)
+            if (len(op_keys) < 2) or op_keys[1] not in ["weight", "bias"]:
+                continue
+            try:
+                op = comfy.utils.get_attr(self.model.diffusion_model, op_keys[0])
+            except Exception:
+                #Key does not resolve to an attribute path on the model; keep the raw entry.
+                #Deliberately not a bare except so KeyboardInterrupt/SystemExit still propagate.
+                continue
+            if not op or not hasattr(op, "comfy_cast_weights") or \
+                (hasattr(op, "comfy_patched_weights") and op.comfy_patched_weights == True):
+                continue
+            key = "diffusion_model." + k
+            unet_state_dict[k] = LazyCastingParam(self, key, comfy.utils.get_attr(self.model, key))
+        return self.model.state_dict_for_saving(unet_state_dict,
+                                                clip_state_dict=clip_state_dict,
+                                                vae_state_dict=vae_state_dict,
+                                                clip_vision_state_dict=clip_vision_state_dict)
+
    def __del__(self):
        self.unpin_all_weights()
        self.detach(unpatch_all=False)

+class ModelPatcherDynamic(ModelPatcher):
+    #ModelPatcher variant for dynamic VRAM loading. Instead of moving weights at load time it
+    #reserves space for them in a per-device comfy_aimdo ModelVBAR and attaches patch functions
+    #to each module. The eager pinning / caching / hook-weight APIs of the base class are either
+    #rejected or turned into no-ops below.
+
+    def __new__(cls, model=None, load_device=None, offload_device=None, size=0, weight_inplace_update=False):
+        #A CPU load device gets a plain ModelPatcher instead. Because the returned object is not
+        #an instance of cls, Python does not run ModelPatcherDynamic.__init__ on it.
+        if load_device is not None and comfy.model_management.is_device_cpu(load_device):
+            #reroute to default MP for CPUs
+            return ModelPatcher(model, load_device, offload_device, size, weight_inplace_update)
+        return super().__new__(cls)
+
+    def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
+        super().__init__(model, load_device, offload_device, size, weight_inplace_update)
+        #this is now way more dynamic and we dont support the same base model for both Dynamic
+        #and non-dynamic patchers.
+        if hasattr(self.model, "model_loaded_weight_memory"):
+            del self.model.model_loaded_weight_memory
+        #dynamic_vbars lives on the model object and is keyed by load device (see _vbar_get).
+        if not hasattr(self.model, "dynamic_vbars"):
+            self.model.dynamic_vbars = {}
+        assert load_device is not None
+
+    def is_dynamic(self):
+        return True
+
+    def _vbar_get(self, create=False):
+        #Returns the ModelVBAR registered for self.load_device, or None when there is none
+        #(always None for a CPU load device). With create=True a missing vbar is allocated.
+        if self.load_device == torch.device("cpu"):
+            return None
+        vbar = self.model.dynamic_vbars.get(self.load_device, None)
+        if create and vbar is None:
+            # x10. We dont know what model defined type casts we have in the vbar, but virtual address
+            # space is pretty free. This will cover someone casting an entire model from FP4 to FP32
+            # with some left over.
+            vbar = comfy_aimdo.model_vbar.ModelVBAR(self.model_size() * 10, self.load_device.index)
+            self.model.dynamic_vbars[self.load_device] = vbar
+        return vbar
+
+    def loaded_size(self):
+        #Whatever the vbar reports as loaded; 0 when no vbar exists yet.
+        vbar = self._vbar_get()
+        if vbar is None:
+            return 0
+        return vbar.loaded_size()
+
+    def get_free_memory(self, device):
+        #NOTE: on high condition / batch counts, estimate should have already vacated
+        #all non-dynamic models so this is safe even if its not 100% true that this
+        #would all be available for inference use.
+        return comfy.model_management.get_total_memory(device) - self.model_size()
+
+    #Pinning is deferred to ops time. Assert against this API to avoid pin leaks.
+
+    def pin_weight_to_device(self, key):
+        raise RuntimeError("pin_weight_to_device invalid for dymamic weight loading")
+
+    def unpin_weight(self, key):
+        raise RuntimeError("unpin_weight invalid for dymamic weight loading")
+
+    def unpin_all_weights(self):
+        #1e32 is simply a budget larger than any real pin total, i.e. "unpin everything".
+        self.partially_unload_ram(1e32)
+
+    def memory_required(self, input_shape):
+        #Pad this significantly. We are trying to get away from precise estimates. This
+        #estimate is only used when using the ModelPatcherDynamic after ModelPatcher. If you
+        #use all ModelPatcherDynamic this is ignored and its all done dynamically.
+        return super().memory_required(input_shape=input_shape) * 1.3 + (1024 ** 3)
+
+
+    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False, dirty=False):
+        #Prepares every module for dynamic loading: attaches the patch / wrapper functions and
+        #reserves vbar space. lowvram_model_memory is not used here other than being passed on
+        #to the ON_LOAD callbacks. dirty=True resets the _v_signature of every item.
+
+        #Force patching doesn't make sense in Dynamic loading, as you dont know what does and
+        #doesn't need to be forced at this stage. The only thing you could do would be patch
+        #it all on CPU which consumes huge RAM.
+        assert not force_patch_weights
+
+        #Full load doesn't make sense as we dont actually have any loader capability here and
+        #now.
+        assert not full_load
+
+        assert device_to == self.load_device
+
+        num_patches = 0
+        allocated_size = 0
+
+        with self.use_ejected():
+            self.unpatch_hooks()
+
+            vbar = self._vbar_get(create=True)
+            if vbar is not None:
+                vbar.prioritize()
+
+            #We have way more tools for acceleration on comfy weight offloading, so always
+            #prioritize the non-comfy weights (note the order reverse).
+            loading = self._load_list(prio_comfy_cast_weights=True)
+            loading.sort(reverse=True)
+
+            for x in loading:
+                _, _, _, n, m, params = x
+
+                def set_dirty(item, dirty):
+                    #Clears the stored signature when asked to, or initialises it when absent.
+                    #NOTE(review): presumably a None signature makes the next cast treat the
+                    #item as not resident - confirm against vbar_signature_compare.
+                    if dirty or not hasattr(item, "_v_signature"):
+                        item._v_signature = None
+
+                def setup_param(self, m, n, param_key):
+                    #Attaches "<param>_lowvram_function" and "<param>_function" to module m and
+                    #returns the vram-aligned size of the parameter (0 when it does not exist).
+                    nonlocal num_patches
+                    key = key_param_name_to_key(n, param_key)
+
+                    weight_function = []
+
+                    weight, _, _ = get_key_weight(self.model, key)
+                    if weight is None:
+                        return 0
+                    if key in self.patches:
+                        setattr(m, param_key + "_lowvram_function", LowVramPatch(key, self.patches))
+                        num_patches += 1
+                    else:
+                        setattr(m, param_key + "_lowvram_function", None)
+
+                    if key in self.weight_wrapper_patches:
+                        weight_function.extend(self.weight_wrapper_patches[key])
+                    setattr(m, param_key + "_function", weight_function)
+                    #Quantized tensors are sized as-is; everything else is sized in the model
+                    #dtype recorded on the module (falling back to the tensor's own dtype).
+                    geometry = weight
+                    if not isinstance(weight, QuantizedTensor):
+                        model_dtype = getattr(m, param_key + "_comfy_model_dtype", weight.dtype)
+                        weight._model_dtype = model_dtype
+                        geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype)
+                    return comfy.memory_management.vram_aligned_size(geometry)
+
+                if hasattr(m, "comfy_cast_weights"):
+                    #Comfy ops: one vbar allocation per module covering weight and bias together.
+                    m.comfy_cast_weights = True
+                    m.pin_failed = False
+                    m.seed_key = n
+                    set_dirty(m, dirty)
+
+                    v_weight_size = 0
+                    v_weight_size += setup_param(self, m, n, "weight")
+                    v_weight_size += setup_param(self, m, n, "bias")
+
+                    if vbar is not None and not hasattr(m, "_v"):
+                        m._v = vbar.alloc(v_weight_size)
+                    allocated_size += v_weight_size
+
+                else:
+                    #Other modules: one vbar allocation per parameter, stored on the tensor itself.
+                    for param in params:
+                        key = key_param_name_to_key(n, param)
+                        weight, _, _ = get_key_weight(self.model, key)
+                        weight.seed_key = key
+                        set_dirty(weight, dirty)
+                        geometry = weight
+                        model_dtype = getattr(m, param + "_comfy_model_dtype", weight.dtype)
+                        geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype)
+                        weight_size = geometry.numel() * geometry.element_size()
+                        if vbar is not None and not hasattr(weight, "_v"):
+                            weight._v = vbar.alloc(weight_size)
+                            weight._model_dtype = model_dtype
+                        allocated_size += weight_size
+
+            logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.")
+
+            self.model.device = device_to
+            self.model.current_weight_patches_uuid = self.patches_uuid
+
+            for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD):
+                #These are all super dangerous. Who knows what the custom nodes actually do here...
+                callback(self, device_to, lowvram_model_memory, force_patch_weights, full_load)
+
+            self.apply_hooks(self.forced_hooks, force_apply=True)
+
+    def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=False):
+        #Asks the vbar to free memory_to_free and returns what it reports; device_to is unused.
+        assert not force_patch_weights #See above
+        assert self.load_device != torch.device("cpu")
+
+        vbar = self._vbar_get()
+        return 0 if vbar is None else vbar.free_memory(memory_to_free)
+
+    def partially_unload_ram(self, ram_to_unload):
+        #Unpins modules in _load_list order until the sizes returned by unpin_memory add up
+        #to at least ram_to_unload.
+        loading = self._load_list(prio_comfy_cast_weights=True)
+        for x in loading:
+            _, _, _, _, m, _ = x
+            ram_to_unload -= comfy.pinned_memory.unpin_memory(m)
+            if ram_to_unload <= 0:
+                return
+
+    def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
+        #This isn't used by the core at all and can only be to load a model out of
+        #the control of proper model_management. If you are a custom node author reading
+        #this, the correct pattern is to call load_models_gpu() to get a proper
+        #managed load of your model.
+        #device_to and lowvram_model_memory are accepted for signature compatibility only.
+        assert not load_weights
+        return super().patch_model(load_weights=load_weights, force_patch_weights=force_patch_weights)
+
+    def unpatch_model(self, device_to=None, unpatch_weights=True):
+        #The base implementation is always called without weight unpatching and without a
+        #device; unpatch_weights=True then drops all pins and frees the whole vbar.
+        super().unpatch_model(device_to=None, unpatch_weights=False)
+
+        if unpatch_weights:
+            self.partially_unload_ram(1e32)
+            self.partially_unload(None, 1e32)
+
+    def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
+        #Re-prepares the model via load(). extra_memory is unused. If load() raises, the
+        #patcher is detached before the exception is re-raised.
+        assert not force_patch_weights #See above
+        with self.use_ejected(skip_and_inject_on_exit_only=True):
+            #The patch set changed since the last load, so stored signatures must be reset.
+            dirty = self.model.current_weight_patches_uuid is not None and (self.model.current_weight_patches_uuid != self.patches_uuid)
+
+            self.unpatch_model(self.offload_device, unpatch_weights=False)
+            self.patch_model(load_weights=False)
+
+            try:
+                self.load(device_to, dirty=dirty)
+            except Exception as e:
+                self.detach()
+                raise e
+            #ModelPatcher::partially_load returns a number on what got loaded but
+            #nothing in core uses this and we have no data in the Dynamic world. Hit
+            #the custom node devs with a None rather than a 0 that would mislead any
+            #logic they might have.
+            return None
+
+    def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter):
+        assert False #Should be unreachable - we dont ever cache in the new implementation
+
+    def patch_hook_weight_to_device(self, hooks: comfy.hooks.HookGroup, combined_patches: dict, key: str, original_weights: dict, memory_counter: MemoryCounter):
+        #Keys without hook patches are silently accepted; any real hook patch is rejected.
+        if key not in combined_patches:
+            return
+
+        raise RuntimeError("Hooks not implemented in ModelPatcherDynamic. Please remove --fast arguments form ComfyUI startup")
+
+    def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None:
+        #Nothing to undo: hook weights are never applied by this class.
+        pass
+
+#Module-level alias for the patcher class; as written it names the non-dynamic ModelPatcher.
+#NOTE(review): presumably rebound to ModelPatcherDynamic elsewhere when dynamic VRAM is
+#enabled - confirm against the callers.
+CoreModelPatcher = ModelPatcher
@@ -19,10 +19,16 @@
 import torch
 import logging
 import comfy.model_management
-from comfy.cli_args import args, PerformanceFeature
+from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
 import comfy.float
 import comfy.rmsnorm
 import json
+import comfy.memory_management
+import comfy.pinned_memory
+import comfy.utils
+
+import comfy_aimdo.model_vbar
+import comfy_aimdo.torch

 def run_every_op():
    if torch.compiler.is_compiling():
@@ -72,7 +78,115 @@ def cast_to_input(weight, input, non_blocking=False, copy=True):
    return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)


-def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False):
+def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype):
+    #Cast path for modules that own a vbar allocation (s._v). Returns (weight, bias, offload
+    #info) like cast_bias_weight, where the offload info is the tuple
+    #(offload_stream, device or None, None).
+    #  s             - module with weight / bias, _v and _v_signature attributes
+    #  dtype         - dtype requested for the weight
+    #  bias_dtype    - dtype requested for the bias
+    #  compute_dtype - dtype used when applying the lowvram patch function; None falls back
+    #                  to the requested dtype of the parameter being processed
+    offload_stream = None
+    xfer_dest = None
+    cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
+
+    #NOTE(review): presumably a None signature means the vbar could not back this allocation
+    #right now and a temporary buffer must be used instead - confirm in comfy_aimdo.
+    signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
+    if signature is not None:
+        xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+    resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
+
+    if not resident:
+        cast_dest = None
+
+        xfer_source = [ s.weight, s.bias ]
+
+        #An existing pin replaces the raw tensors as the transfer source.
+        pin = comfy.pinned_memory.get_pin(s)
+        if pin is not None:
+            xfer_source = [ pin ]
+
+        #If any parameter's stored dtype differs from its target geometry, the data is first
+        #transferred into a separate buffer and then copied into cast_dest (see below).
+        for data, geometry in zip([ s.weight, s.bias ], cast_geometry):
+            if data is None:
+                continue
+            if data.dtype != geometry.dtype:
+                cast_dest = xfer_dest
+                if cast_dest is None:
+                    cast_dest = torch.empty((comfy.memory_management.vram_aligned_size(cast_geometry),), dtype=torch.uint8, device=device)
+                xfer_dest = None
+                break
+
+        #Pick a transfer destination: a stream cast buffer if one is available (one retry with
+        #a freshly requested stream), otherwise a newly allocated buffer with no offload stream.
+        dest_size = comfy.memory_management.vram_aligned_size(xfer_source)
+        offload_stream = comfy.model_management.get_offload_stream(device)
+        if xfer_dest is None and offload_stream is not None:
+                xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
+                if xfer_dest is None:
+                    offload_stream = comfy.model_management.get_offload_stream(device)
+                    xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
+        if xfer_dest is None:
+            xfer_dest = torch.empty((dest_size,), dtype=torch.uint8, device=device)
+            offload_stream = None
+
+        #Only when there is no vbar signature and no pin existed do we try to create a pin.
+        #In every other case pin is cleared so the gather-into-pin step below is skipped
+        #(an already existing pin stays in xfer_source from above).
+        if signature is None and pin is None:
+            comfy.pinned_memory.pin_memory(s)
+            pin = comfy.pinned_memory.get_pin(s)
+        else:
+            pin = None
+
+        if pin is not None:
+            comfy.model_management.cast_to_gathered(xfer_source, pin)
+            xfer_source = [ pin ]
+        #send it over
+        comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream)
+        comfy.model_management.sync_stream(device, offload_stream)
+
+        if cast_dest is not None:
+            #Copy each transferred parameter into its target-geometry view; copy_ performs
+            #the dtype conversion.
+            for pre_cast, post_cast in zip(comfy.memory_management.interpret_gathered_like([s.weight, s.bias ], xfer_dest),
+                                           comfy.memory_management.interpret_gathered_like(cast_geometry, cast_dest)):
+                if post_cast is not None:
+                    post_cast.copy_(pre_cast)
+            xfer_dest = cast_dest
+
+    params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest)
+    weight = params[0]
+    bias = params[1]
+
+    def post_cast(s, param_key, x, dtype, resident, update_weight):
+        #Applies dtype conversion, the lowvram patch (only when the data was just transferred)
+        #and the "<param>_function" wrappers to one parameter. With update_weight the patched
+        #result is also written back into the buffer x was read from.
+        lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
+        fns = getattr(s, param_key + "_function", [])
+
+        orig = x
+
+        def to_dequant(tensor, dtype):
+            tensor = tensor.to(dtype=dtype)
+            if isinstance(tensor, QuantizedTensor):
+                tensor = tensor.dequantize()
+            return tensor
+
+        if orig.dtype != dtype or len(fns) > 0:
+            x = to_dequant(x, dtype)
+        if not resident and lowvram_fn is not None:
+            x = to_dequant(x, dtype if compute_dtype is None else compute_dtype)
+            #FIXME: this is not accurate, we need to be sensitive to the compute dtype
+            x = lowvram_fn(x)
+            #Re-quantize the patched value when the source was quantized and either the layer
+            #consumes it as-is or it has to be stored back.
+            if (isinstance(orig, QuantizedTensor) and
+                (orig.dtype == dtype and len(fns) == 0 or update_weight)):
+                seed = comfy.utils.string_to_seed(s.seed_key)
+                y = QuantizedTensor.from_float(x, s.layout_type, scale="recalculate", stochastic_rounding=seed)
+                if orig.dtype == dtype and len(fns) == 0:
+                    #The layer actually wants our freshly saved QT
+                    x = y
+            else:
+                y = x
+            if update_weight:
+                orig.copy_(y)
+        for f in fns:
+            x = f(x)
+        return x
+
+    #Patched results are only written back when a vbar signature was obtained.
+    update_weight = signature is not None
+
+    weight = post_cast(s, "weight", weight, dtype, resident, update_weight)
+    if s.bias is not None:
+        bias = post_cast(s, "bias", bias, bias_dtype, resident, update_weight)
+    s._v_signature=signature
+
+    #FIXME: weird offload return protocol
+    #The second tuple slot carries the device (not a tensor) when a signature was obtained;
+    #uncast_bias_weight tells the two cases apart with an isinstance check on that slot.
+    return weight, bias, (offload_stream, device if signature is not None else None, None)
+
+
+def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None):
    # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
    # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
    # will add async-offload support to your cast and improve performance.
@@ -87,22 +201,38 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
        if device is None:
            device = input.device

+    non_blocking = comfy.model_management.device_supports_non_blocking(device)
+
+    if hasattr(s, "_v"):
+        return cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype)
+
    if offloadable and (device != s.weight.device or
                        (s.bias is not None and device != s.bias.device)):
        offload_stream = comfy.model_management.get_offload_stream(device)
    else:
        offload_stream = None

-    non_blocking = comfy.model_management.device_supports_non_blocking(device)
+    bias = None
+    weight = None
+
+    if offload_stream is not None and not args.cuda_malloc:
+        cast_buffer_size = comfy.memory_management.vram_aligned_size([ s.weight, s.bias ])
+        cast_buffer = comfy.model_management.get_cast_buffer(offload_stream, device, cast_buffer_size, s)
+        #The streams can be uneven in buffer capability and reject us. Retry to get the other stream
+        if cast_buffer is None:
+            offload_stream = comfy.model_management.get_offload_stream(device)
+            cast_buffer = comfy.model_management.get_cast_buffer(offload_stream, device, cast_buffer_size, s)
+        params = comfy.memory_management.interpret_gathered_like([ s.weight, s.bias ], cast_buffer)
+        weight = params[0]
+        bias = params[1]

    weight_has_function = len(s.weight_function) > 0
    bias_has_function = len(s.bias_function) > 0

-    weight = comfy.model_management.cast_to(s.weight, None, device, non_blocking=non_blocking, copy=weight_has_function, stream=offload_stream)
+    weight = comfy.model_management.cast_to(s.weight, None, device, non_blocking=non_blocking, copy=weight_has_function, stream=offload_stream, r=weight)

-    bias = None
    if s.bias is not None:
-        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream)
+        bias = comfy.model_management.cast_to(s.bias, None, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream, r=bias)

    comfy.model_management.sync_stream(device, offload_stream)

@@ -110,6 +240,7 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
    weight_a = weight

    if s.bias is not None:
+        bias = bias.to(dtype=bias_dtype)
        for f in s.bias_function:
            bias = f(bias)

@@ -131,14 +262,20 @@ def uncast_bias_weight(s, weight, bias, offload_stream):
    if offload_stream is None:
        return
    os, weight_a, bias_a = offload_stream
+    device=None
+    #FIXME: This is not good RTTI
+    if not isinstance(weight_a, torch.Tensor):
+        comfy_aimdo.model_vbar.vbar_unpin(s._v)
+        device = weight_a
    if os is None:
        return
-    if weight_a is not None:
-        device = weight_a.device
-    else:
-        if bias_a is None:
-            return
-        device = bias_a.device
+    if device is None:
+        if weight_a is not None:
+            device = weight_a.device
+        else:
+            if bias_a is None:
+                return
+            device = bias_a.device
    os.wait_stream(comfy.model_management.current_stream(device))


@@ -149,6 +286,57 @@ class CastWeightBiasOp:

 class disable_weight_init:
    class Linear(torch.nn.Linear, CastWeightBiasOp):
+
+        def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
+            #On Windows with dynamic VRAM enabled the parameters are not allocated here; they
+            #are created later in _load_from_state_dict. Everywhere else this is a plain
+            #torch.nn.Linear construction.
+            defer_allocation = comfy.model_management.WINDOWS and enables_dynamic_vram()
+            if not defer_allocation:
+                super().__init__(in_features, out_features, bias, device, dtype)
+                return
+
+            # `torch.empty` still reserves the full memory for the layer. Windows does not
+            # over-commit memory, so a normal construction would momentarily be commit
+            # charged for the weight even though it might be zero-copied when the state
+            # dict is loaded. If the commit charge exceeds the ceiling the system can be
+            # destabilized.
+            torch.nn.Module.__init__(self)
+            self.in_features, self.out_features = in_features, out_features
+            self.weight = self.bias = None
+            #Remember whether a bias was requested so a missing one can be created on load.
+            self.comfy_need_lazy_init_bias=bias
+            self.weight_comfy_model_dtype = self.bias_comfy_model_dtype = dtype
+
+        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
+                                strict, missing_keys, unexpected_keys, error_msgs):
+            #Counterpart of the lazy __init__: on Windows with dynamic VRAM enabled the
+            #weight / bias parameters are created here straight from the state dict. They are
+            #cloned unless local_metadata requests assignment ("assign_to_params_buffers").
+            #Other keys under this prefix are reported through unexpected_keys; parameters
+            #absent from the state dict are zero-filled and reported through missing_keys.
+            #On every other configuration this defers to the stock implementation.
+
+            if not comfy.model_management.WINDOWS or not enables_dynamic_vram():
+                return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
+                                                     missing_keys, unexpected_keys, error_msgs)
+            assign_to_params_buffers = local_metadata.get("assign_to_params_buffers", False)
+            prefix_len = len(prefix)
+            for k,v in state_dict.items():
+                #Only keys under this module's prefix are ours. Without this check a foreign
+                #key whose prefix merely has the same length could be loaded as our weight.
+                if not k.startswith(prefix):
+                    continue
+                if k[prefix_len:] == "weight":
+                    if not assign_to_params_buffers:
+                        v = v.clone()
+                    self.weight = torch.nn.Parameter(v, requires_grad=False)
+                elif k[prefix_len:] == "bias" and v is not None:
+                    if not assign_to_params_buffers:
+                        v = v.clone()
+                    self.bias = torch.nn.Parameter(v, requires_grad=False)
+                else:
+                    unexpected_keys.append(k)
+
+            #Reconcile default construction of the weight if its missing.
+            if self.weight is None:
+                #torch.nn.Linear stores its weight as (out_features, in_features).
+                v = torch.zeros(self.out_features, self.in_features)
+                self.weight = torch.nn.Parameter(v, requires_grad=False)
+                missing_keys.append(prefix+"weight")
+            if self.bias is None and self.comfy_need_lazy_init_bias:
+                v = torch.zeros(self.out_features,)
+                self.bias = torch.nn.Parameter(v, requires_grad=False)
+                missing_keys.append(prefix+"bias")
+
+
        def reset_parameters(self):
            return None

@@ -203,7 +391,9 @@ class disable_weight_init:
        def reset_parameters(self):
            return None

-        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+        def _conv_forward(self, input, weight, bias, autopad=None, *args, **kwargs):
+            if autopad == "causal_zero":
+                weight = weight[:, :, -input.shape[2]:, :, :]
            if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
                out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
                if bias is not None:
@@ -212,15 +402,15 @@ class disable_weight_init:
            else:
                return super()._conv_forward(input, weight, bias, *args, **kwargs)

-        def forward_comfy_cast_weights(self, input):
+        def forward_comfy_cast_weights(self, input, autopad=None):
            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-            x = self._conv_forward(input, weight, bias)
+            x = self._conv_forward(input, weight, bias, autopad=autopad)
            uncast_bias_weight(self, weight, bias, offload_stream)
            return x

        def forward(self, *args, **kwargs):
            run_every_op()
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0 or "autopad" in kwargs:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -653,8 +843,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
            def _forward(self, input, weight, bias):
                return torch.nn.functional.linear(input, weight, bias)

-            def forward_comfy_cast_weights(self, input):
-                weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
+            def forward_comfy_cast_weights(self, input, compute_dtype=None):
+                weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True, compute_dtype=compute_dtype)
                x = self._forward(input, weight, bias)
                uncast_bias_weight(self, weight, bias, offload_stream)
                return x
@@ -664,6 +854,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec

                input_shape = input.shape
                reshaped_3d = False
+                #If cast needs to apply lora, it should be done in the compute dtype
+                compute_dtype = input.dtype

                if (getattr(self, 'layout_type', None) is not None and
                    not isinstance(input, QuantizedTensor) and not self._full_precision_mm and
@@ -682,7 +874,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
                            scale = comfy.model_management.cast_to_device(scale, input.device, None)
                        input = QuantizedTensor.from_float(input_reshaped, self.layout_type, scale=scale)

-                output = self.forward_comfy_cast_weights(input)
+
+                output = self.forward_comfy_cast_weights(input, compute_dtype)

                # Reshape output back to 3D if input was 3D
                if reshaped_3d:
@@ -0,0 +1,29 @@
+import torch
+import comfy.model_management
+import comfy.memory_management
+
+from comfy.cli_args import args
+
+def get_pin(module):
+    #Returns the pinned staging buffer stored on the module as _pin, or None if there is none.
+    try:
+        return module._pin
+    except AttributeError:
+        return None
+
+def pin_memory(module):
+    #Allocates a uint8 buffer big enough for the module's weight and bias, pins it and stores
+    #it as module._pin. Returns True on success and False when pinning fails (the module is
+    #then flagged with pin_failed). Returns None without doing anything when pinning already
+    #failed once, is disabled on the command line, or a pin already exists.
+    if module.pin_failed:
+        return
+    if args.disable_pinned_memory:
+        return
+    if get_pin(module) is not None:
+        return
+    #FIXME: This is a RAM cache trigger event
+    nbytes = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
+    buffer = torch.empty((nbytes,), dtype=torch.uint8)
+    if not comfy.model_management.pin_memory(buffer):
+        module.pin_failed = True
+        return False
+    module._pin = buffer
+    return True
+
+def unpin_memory(module):
+    #Releases the module's pinned buffer, if any, and returns its size in bytes (0 when the
+    #module had no pin).
+    pinned = get_pin(module)
+    if pinned is None:
+        return 0
+    nbytes = pinned.numel() * pinned.element_size()
+    comfy.model_management.unpin_memory(pinned)
+    del module._pin
+    return nbytes
@@ -37,12 +37,18 @@ def prepare_noise(latent_image, seed, noise_inds=None):

    return noises

-def fix_empty_latent_channels(model, latent_image):
+def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None):
    if latent_image.is_nested:
        return latent_image
    latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
-    if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
-        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+    if torch.count_nonzero(latent_image) == 0:
+        if latent_format.latent_channels != latent_image.shape[1]:
+            latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+        if downscale_ratio_spacial is not None:
+            if downscale_ratio_spacial != latent_format.spacial_downscale_ratio:
+                ratio = downscale_ratio_spacial / latent_format.spacial_downscale_ratio
+                latent_image = comfy.utils.common_upscale(latent_image, round(latent_image.shape[-1] * ratio), round(latent_image.shape[-2] * ratio), "nearest-exact", crop="disabled")
+
    if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
        latent_image = latent_image.unsqueeze(2)
    return latent_image
@@ -9,7 +9,6 @@ if TYPE_CHECKING:
 import torch
 from functools import partial
 import collections
-from comfy import model_management
 import math
 import logging
 import comfy.sampler_helpers
@@ -260,7 +259,7 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
            to_batch_temp.reverse()
            to_batch = to_batch_temp[:1]

-            free_memory = model_management.get_free_memory(x_in.device)
+            free_memory = model.current_patcher.get_free_memory(x_in.device)
            for i in range(1, len(to_batch_temp) + 1):
                batch_amount = to_batch_temp[:len(to_batch_temp)//i]
                input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
@@ -20,6 +20,7 @@ import comfy.ldm.ace.vae.music_dcae_pipeline
 import comfy.ldm.hunyuan_video.vae
 import comfy.ldm.mmaudio.vae.autoencoder
 import comfy.pixel_space_convert
+import comfy.weight_adapter
 import yaml
 import math
 import os
@@ -57,6 +58,8 @@ import comfy.text_encoders.ovis
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.jina_clip_2
 import comfy.text_encoders.newbie
+import comfy.text_encoders.anima
+import comfy.text_encoders.ace15

 import comfy.model_patcher
 import comfy.lora
@@ -100,6 +103,105 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
    return (new_modelpatcher, new_clip)


+def load_bypass_lora_for_models(model, clip, lora, strength_model, strength_clip):
+    """
+    Attach a LoRA to a model and/or CLIP in bypass mode.
+
+    Adapter entries (WeightAdapterBase instances) are not merged into the base
+    weights. They are registered with a BypassInjectionManager and installed on
+    the cloned patcher under the "bypass_lora" injection name, so the LoRA path
+    runs alongside the base forward: output = base_forward(x) + lora_path(x)
+
+    Every other loaded entry (bias diff, weight diff, etc.) is applied through
+    the regular add_patches() path.
+
+    This is useful for training and when model weights are offloaded.
+
+    Either `model` or `clip` may be None; None is then returned in its slot.
+    Returns the tuple (new_modelpatcher, new_clip).
+    """
+    key_map = {}
+    if model is not None:
+        key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
+    if clip is not None:
+        key_map = comfy.lora.model_lora_keys_clip(clip.cond_stage_model, key_map)
+
+    logging.debug(f"[BypassLoRA] key_map has {len(key_map)} entries")
+
+    loaded = comfy.lora.load_lora(comfy.lora_convert.convert_lora(lora), key_map)
+
+    logging.debug(f"[BypassLoRA] loaded has {len(loaded)} entries")
+
+    # Adapters go through bypass injection, everything else through weight patching.
+    bypass_patches = {
+        key: patch for key, patch in loaded.items()
+        if isinstance(patch, comfy.weight_adapter.WeightAdapterBase)
+    }
+    regular_patches = {
+        key: patch for key, patch in loaded.items()
+        if key not in bypass_patches
+    }
+
+    logging.debug(f"[BypassLoRA] {len(bypass_patches)} bypass adapters, {len(regular_patches)} regular patches")
+
+    # Keys that ended up applied to the model / to the clip, for the final report.
+    model_applied = set()
+    clip_applied = set()
+
+    new_modelpatcher = None
+    if model is not None:
+        new_modelpatcher = model.clone()
+
+        if regular_patches:
+            model_applied.update(new_modelpatcher.add_patches(regular_patches, strength_model))
+
+        manager = comfy.weight_adapter.BypassInjectionManager()
+        model_sd_keys = set(new_modelpatcher.model.state_dict().keys())
+
+        for key, adapter in bypass_patches.items():
+            if key not in model_sd_keys:
+                logging.warning(f"[BypassLoRA] Adapter key not in model state_dict: {key}")
+                continue
+            manager.add_adapter(key, adapter, strength=strength_model)
+            model_applied.add(key)
+
+        injections = manager.create_injections(new_modelpatcher.model)
+
+        # Only register the injection set when at least one hook was created.
+        if manager.get_hook_count() > 0:
+            new_modelpatcher.set_injections("bypass_lora", injections)
+
+    new_clip = None
+    if clip is not None:
+        new_clip = clip.clone()
+
+        if regular_patches:
+            clip_applied.update(new_clip.add_patches(regular_patches, strength_clip))
+
+        clip_manager = comfy.weight_adapter.BypassInjectionManager()
+        clip_sd_keys = set(new_clip.cond_stage_model.state_dict().keys())
+
+        # Adapter keys absent from the clip state_dict are skipped silently here.
+        for key, adapter in bypass_patches.items():
+            if key in clip_sd_keys:
+                clip_manager.add_adapter(key, adapter, strength=strength_clip)
+                clip_applied.add(key)
+
+        clip_injections = clip_manager.create_injections(new_clip.cond_stage_model)
+        if clip_manager.get_hook_count() > 0:
+            new_clip.patcher.set_injections("bypass_lora", clip_injections)
+
+    # Report every loaded entry that was applied to neither target.
+    for name, patch_data in loaded.items():
+        if name in model_applied or name in clip_applied:
+            continue
+        if isinstance(patch_data, tuple):
+            patch_type = f"tuple({patch_data[0]})"
+        else:
+            patch_type = type(patch_data).__name__
+        logging.warning(f"NOT LOADED: {name} (type={patch_type})")
+
+    return (new_modelpatcher, new_clip)
+
+
 class CLIP:
    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, state_dict=[], model_options={}):
        if no_init:
@@ -127,8 +229,10 @@ class CLIP:
                    self.cond_stage_model.to(offload_device)
                    logging.warning("Had to shift TE back.")

+        model_management.archive_model_dtypes(self.cond_stage_model)
+
        self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
-        self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
+        self.patcher = comfy.model_patcher.CoreModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
        #Match torch.float32 hardcode upcast in TE implemention
        self.patcher.set_model_compute_dtype(torch.float32)
        self.patcher.hook_mode = comfy.hooks.EnumHookMode.MinVram
@@ -288,8 +392,18 @@ class CLIP:

    def load_sd(self, sd, full_model=False):
        if full_model:
-            return self.cond_stage_model.load_state_dict(sd, strict=False)
+            return self.cond_stage_model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())
        else:
+            can_assign = self.patcher.is_dynamic()
+            self.cond_stage_model.can_assign_sd = can_assign
+
+            # The CLIP models are a pretty complex web of wrappers and it's
+            # a bit of an API change to plumb this all the way through.
+            # So spray paint the model with this flag that the loading
+            # nn.Module can then inspect for itself.
+            for m in self.cond_stage_model.modules():
+                m.can_assign_sd = can_assign
+
            return self.cond_stage_model.load_sd(sd)

    def get_sd(self):
@@ -339,6 +453,8 @@ class VAE:
        self.extra_1d_channel = None
        self.crop_input = True

+        self.audio_sample_rate = 44100
+
        if config is None:
            if "decoder.mid.block_1.mix_factor" in sd:
                encoder_config = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
@@ -436,14 +552,27 @@ class VAE:
                                                                    encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
                                                                    decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
            elif "decoder.layers.1.layers.0.beta" in sd:
-                self.first_stage_model = AudioOobleckVAE()
+                config = {}
+                param_key = None
+                self.upscale_ratio = 2048
+                self.downscale_ratio = 2048
+                if "decoder.layers.2.layers.1.weight_v" in sd:
+                    param_key = "decoder.layers.2.layers.1.weight_v"
+                if "decoder.layers.2.layers.1.parametrizations.weight.original1" in sd:
+                    param_key = "decoder.layers.2.layers.1.parametrizations.weight.original1"
+                if param_key is not None:
+                    if sd[param_key].shape[-1] == 12:
+                        config["strides"] = [2, 4, 4, 6, 10]
+                        self.audio_sample_rate = 48000
+                        self.upscale_ratio = 1920
+                        self.downscale_ratio = 1920
+
+                self.first_stage_model = AudioOobleckVAE(**config)
                self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype)
                self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * 2048) * model_management.dtype_size(dtype)
                self.latent_channels = 64
                self.output_channels = 2
                self.pad_channel_value = "replicate"
-                self.upscale_ratio = 2048
-                self.downscale_ratio =  2048
                self.latent_dim = 1
                self.process_output = lambda audio: audio
                self.process_input = lambda audio: audio
@@ -635,14 +764,13 @@ class VAE:
                self.upscale_index_formula = (4, 16, 16)
                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
                self.downscale_index_formula = (4, 16, 16)
-                if self.latent_channels == 48: # Wan 2.2
+                if self.latent_channels in [48, 128]: # Wan 2.2 and LTX2
                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=None) # taehv doesn't need scaling
-                    self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
+                    self.process_input = self.process_output = lambda image: image
                    self.process_output = lambda image: image
                    self.memory_used_decode = lambda shape, dtype: (1800 * (max(1, (shape[-3] ** 0.7 * 0.1)) * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype))
                elif self.latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12: # lighttae_hv15
                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=comfy.latent_formats.HunyuanVideo15)
-                    self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
                    self.memory_used_decode = lambda shape, dtype: (1200 * (max(1, (shape[-3] ** 0.7 * 0.05)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
                else:
                    if sd["decoder.1.weight"].dtype == torch.float16: # taehv currently only available in float16, so assume it's not lighttaew2_1 as otherwise state dicts are identical
@@ -665,12 +793,7 @@ class VAE:
            self.first_stage_model = AutoencoderKL(**(config['params']))
        self.first_stage_model = self.first_stage_model.eval()

-        m, u = self.first_stage_model.load_state_dict(sd, strict=False)
-        if len(m) > 0:
-            logging.warning("Missing VAE keys {}".format(m))
-
-        if len(u) > 0:
-            logging.debug("Leftover VAE keys {}".format(u))
+        model_management.archive_model_dtypes(self.first_stage_model)

        if device is None:
            device = model_management.vae_device()
@@ -682,7 +805,18 @@ class VAE:
        self.first_stage_model.to(self.vae_dtype)
        self.output_device = model_management.intermediate_device()

-        self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
+        mp = comfy.model_patcher.CoreModelPatcher
+        if self.disable_offload:
+            mp = comfy.model_patcher.ModelPatcher
+        self.patcher = mp(self.first_stage_model, load_device=self.device, offload_device=offload_device)
+
+        m, u = self.first_stage_model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())
+        if len(m) > 0:
+            logging.warning("Missing VAE keys {}".format(m))
+
+        if len(u) > 0:
+            logging.debug("Leftover VAE keys {}".format(u))
+
        logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))
        self.model_size()

@@ -738,7 +872,7 @@ class VAE:
            / 3.0)
        return output

-    def decode_tiled_1d(self, samples, tile_x=128, overlap=32):
+    def decode_tiled_1d(self, samples, tile_x=256, overlap=32):
        if samples.ndim == 3:
            decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        else:
@@ -797,7 +931,7 @@ class VAE:
        try:
            memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
-            free_memory = model_management.get_free_memory(self.device)
+            free_memory = self.patcher.get_free_memory(self.device)
            batch_number = int(free_memory / memory_used)
            batch_number = max(1, batch_number)

@@ -871,7 +1005,7 @@ class VAE:
        try:
            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
-            free_memory = model_management.get_free_memory(self.device)
+            free_memory = self.patcher.get_free_memory(self.device)
            batch_number = int(free_memory / max(1, memory_used))
            batch_number = max(1, batch_number)
            samples = None
@@ -1048,6 +1182,7 @@ class TEModel(Enum):
    GEMMA_3_12B = 18
    JINA_CLIP_2 = 19
    QWEN3_8B = 20
+    QWEN3_06B = 21


 def detect_te_model(sd):
@@ -1093,6 +1228,8 @@ def detect_te_model(sd):
                return TEModel.QWEN3_2B
            elif weight.shape[0] == 4096:
                return TEModel.QWEN3_8B
+            elif weight.shape[0] == 1024:
+                return TEModel.QWEN3_06B
        if weight.shape[0] == 5120:
            if "model.layers.39.post_attention_layernorm.weight" in sd:
                return TEModel.MISTRAL3_24B
@@ -1233,6 +1370,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif te_model == TEModel.JINA_CLIP_2:
            clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
            clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
+        elif te_model == TEModel.QWEN3_06B:
+            clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
        else:
            # clip_l
            if clip_type == CLIPType.SD3:
@@ -1303,6 +1443,14 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                clip_data_jina = clip_data[0]
            tokenizer_data["gemma_spiece_model"] = clip_data_gemma.get("spiece_model", None)
            tokenizer_data["jina_spiece_model"] = clip_data_jina.get("spiece_model", None)
+        elif clip_type == CLIPType.ACE:
+            te_models = [detect_te_model(clip_data[0]), detect_te_model(clip_data[1])]
+            if TEModel.QWEN3_4B in te_models:
+                model_type = "qwen3_4b"
+            else:
+                model_type = "qwen3_2b"
+            clip_target.clip = comfy.text_encoders.ace15.te(lm_model=model_type, **llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.ace15.ACE15Tokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
@@ -1326,7 +1474,7 @@ def load_gligen(ckpt_path):
    model = gligen.load_gligen(data)
    if model_management.should_use_fp16():
        model = model.half()
-    return comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device())
+    return comfy.model_patcher.CoreModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device())

 def model_detection_error_hint(path, state_dict):
    filename = os.path.basename(path)
@@ -1414,7 +1562,8 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    if output_model:
        inital_load_device = model_management.unet_inital_load_device(parameters, unet_dtype)
        model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
-        model.load_model_weights(sd, diffusion_model_prefix)
+        model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device())
+        model.load_model_weights(sd, diffusion_model_prefix, assign=model_patcher.is_dynamic())

    if output_vae:
        vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
@@ -1457,7 +1606,6 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
        logging.debug("left over keys: {}".format(left_over))

    if output_model:
-        model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device())
        if inital_load_device != torch.device("cpu"):
            logging.info("loaded diffusion model directly to GPU")
            model_management.load_models_gpu([model_patcher], force_full_load=True)
@@ -1549,13 +1697,14 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None):
        model_config.optimizations["fp8"] = True

    model = model_config.get_model(new_sd, "")
-    model = model.to(offload_device)
-    model.load_model_weights(new_sd, "")
+    model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=load_device, offload_device=offload_device)
+    if not model_management.is_device_cpu(offload_device):
+        model.to(offload_device)
+    model.load_model_weights(new_sd, "", assign=model_patcher.is_dynamic())
    left_over = sd.keys()
    if len(left_over) > 0:
        logging.info("left over keys in diffusion model: {}".format(left_over))
-    return comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device)
-
+    return model_patcher

 def load_diffusion_model(unet_path, model_options={}):
    sd, metadata = comfy.utils.load_torch_file(unet_path, return_metadata=True)
@@ -1586,9 +1735,9 @@ def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, m
    if metadata is None:
        metadata = {}

-    model_management.load_models_gpu(load_models, force_patch_weights=True)
+    model_management.load_models_gpu(load_models)
    clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
-    sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)
+    sd = model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)
    for k in extra_keys:
        sd[k] = extra_keys[k]

@@ -155,6 +155,8 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        self.execution_device = options.get("execution_device", self.execution_device)
        if isinstance(self.layer, list) or self.layer == "all":
            pass
+        elif isinstance(layer_idx, list):
+            self.layer = layer_idx
        elif layer_idx is None or abs(layer_idx) > self.num_layers:
            self.layer = "last"
        else:
@@ -297,7 +299,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        return self(tokens)

    def load_sd(self, sd):
-        return self.transformer.load_state_dict(sd, strict=False)
+        return self.transformer.load_state_dict(sd, strict=False, assign=getattr(self, "can_assign_sd", False))

 def parse_parentheses(string):
    result = []
@@ -466,7 +468,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
    return embed_out

 class SDTokenizer:
-    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, disable_weights=False, tokenizer_data={}, tokenizer_args={}):
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, start_token=None, min_padding=None, pad_left=False, disable_weights=False, tokenizer_data={}, tokenizer_args={}):
        if tokenizer_path is None:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
@@ -479,8 +481,15 @@ class SDTokenizer:
        empty = self.tokenizer('')["input_ids"]
        self.tokenizer_adds_end_token = has_end_token
        if has_start_token:
-            self.tokens_start = 1
-            self.start_token = empty[0]
+            if len(empty) > 0:
+                self.tokens_start = 1
+                self.start_token = empty[0]
+            else:
+                self.tokens_start = 0
+                self.start_token = start_token
+                if start_token is None:
+                    logging.warning("WARNING: There's something wrong with your tokenizers.'")
+
            if end_token is not None:
                self.end_token = end_token
            else:
@@ -488,7 +497,7 @@ class SDTokenizer:
                    self.end_token = empty[1]
        else:
            self.tokens_start = 0
-            self.start_token = None
+            self.start_token = start_token
            if end_token is not None:
                self.end_token = end_token
            else:
@@ -23,6 +23,8 @@ import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image
+import comfy.text_encoders.anima
+import comfy.text_encoders.ace15

 from . import supported_models_base
 from . import latent_formats
@@ -770,10 +772,24 @@ class Flux2(Flux):
        return out

    def clip_target(self, state_dict={}):
-        return None # TODO
        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref))
+        if len(detect) > 0:
+            detect["model_type"] = "qwen3_4b"
+            return supported_models_base.ClipTarget(comfy.text_encoders.flux.KleinTokenizer, comfy.text_encoders.flux.klein_te(**detect))
+
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_8b.transformer.".format(pref))
+        if len(detect) > 0:
+            detect["model_type"] = "qwen3_8b"
+            return supported_models_base.ClipTarget(comfy.text_encoders.flux.KleinTokenizer8B, comfy.text_encoders.flux.klein_te(**detect))
+
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}mistral3_24b.transformer.".format(pref))
+        if len(detect) > 0:
+            if "{}mistral3_24b.transformer.model.layers.39.post_attention_layernorm.weight".format(pref) not in state_dict:
+                detect["pruned"] = True
+            return supported_models_base.ClipTarget(comfy.text_encoders.flux.Flux2Tokenizer, comfy.text_encoders.flux.flux2_te(**detect))
+
+        return None

 class GenmoMochi(supported_models_base.BASE):
    unet_config = {
@@ -992,6 +1008,36 @@ class CosmosT2IPredict2(supported_models_base.BASE):
        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, comfy.text_encoders.cosmos.te(**t5_detect))

+class Anima(supported_models_base.BASE):
+    # Matched against unet configs whose "image_model" is "anima".
+    unet_config = {"image_model": "anima"}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 3.0}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Wan21
+
+    # Class-level default; __init__ overrides it per instance.
+    memory_usage_factor = 1.0
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+        # Scale the factor by "model_channels" relative to 2048 (2048 is also
+        # the value assumed when the key is missing from the config).
+        channels = unet_config.get("model_channels", 2048)
+        self.memory_usage_factor = (channels / 2048) * 0.95
+
+    def get_model(self, state_dict, prefix="", device=None):
+        # state_dict and prefix are accepted but not used here.
+        return model_base.Anima(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        # Look up the text encoder under the first text encoder key prefix.
+        te_prefix = "{}qwen3_06b.transformer.".format(self.text_encoder_key_prefix[0])
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, te_prefix)
+        text_encoder = comfy.text_encoders.anima.te(**detect)
+        return supported_models_base.ClipTarget(comfy.text_encoders.anima.AnimaTokenizer, text_encoder)
+
 class CosmosI2VPredict2(CosmosT2IPredict2):
    unet_config = {
        "image_model": "cosmos_predict2",
@@ -1048,7 +1094,7 @@ class ZImage(Lumina2):

    def __init__(self, unet_config):
        super().__init__(unet_config)
-        if comfy.model_management.extended_fp16_support():
+        if comfy.model_management.extended_fp16_support() and unet_config.get("allow_fp16", False):
            self.supported_inference_dtypes = self.supported_inference_dtypes.copy()
            self.supported_inference_dtypes.insert(1, torch.float16)

@@ -1551,6 +1597,46 @@ class Kandinsky5Image(Kandinsky5):
        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))


-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]
+class ACEStep15(supported_models_base.BASE):
+    # Matched against unet configs whose "audio_model" is "ace1.5".
+    unet_config = {
+        "audio_model": "ace1.5",
+    }
+
+    unet_extra_config = {
+    }
+
+    sampling_settings = {
+        "multiplier": 1.0,
+        "shift": 3.0,
+    }
+
+    latent_format = comfy.latent_formats.ACEAudio15
+
+    memory_usage_factor = 4.7
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        # state_dict and prefix are accepted but not used here.
+        out = model_base.ACEStep15(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        """
+        Build the ClipTarget for the Qwen3 language model found in state_dict.
+
+        The "qwen3_2b" prefix is checked before "qwen3_4b". Returns None when
+        neither text encoder is detected, instead of failing on an unbound
+        local variable.
+        """
+        pref = self.text_encoder_key_prefix[0]
+        detect_2b = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_2b.transformer.".format(pref))
+        detect_4b = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref))
+        if "dtype_llama" in detect_2b:
+            detect = detect_2b
+            detect["lm_model"] = "qwen3_2b"
+        elif "dtype_llama" in detect_4b:
+            detect = detect_4b
+            detect["lm_model"] = "qwen3_4b"
+        else:
+            # No bundled text encoder (e.g. diffusion-model-only weights).
+            return None
+
+        return supported_models_base.ClipTarget(comfy.text_encoders.ace15.ACE15Tokenizer, comfy.text_encoders.ace15.te(**detect))
+
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]

 models += [SVD_img2vid]
@@ -112,7 +112,8 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar):


 class TAEHV(nn.Module):
-    def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True):
+    def __init__(self, latent_channels, parallel=False, encoder_time_downscale=(True, True, False), decoder_time_upscale=(False, True, True), decoder_space_upscale=(True, True, True),
+                 latent_format=None, show_progress_bar=False):
        super().__init__()
        self.image_channels = 3
        self.patch_size = 1
@@ -124,6 +125,9 @@ class TAEHV(nn.Module):
        self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x)
        if self.latent_channels in [48, 32]: # Wan 2.2 and HunyuanVideo1.5
            self.patch_size = 2
+        elif self.latent_channels == 128: # LTX2
+            self.patch_size, self.latent_channels, encoder_time_downscale, decoder_time_upscale = 4, 128, (True, True, True), (True, True, True)
+
        if self.latent_channels == 32: # HunyuanVideo1.5
            act_func = nn.LeakyReLU(0.2, inplace=True)
        else: # HunyuanVideo, Wan 2.1
@@ -131,41 +135,52 @@ class TAEHV(nn.Module):

        self.encoder = nn.Sequential(
            conv(self.image_channels*self.patch_size**2, 64), act_func,
-            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
-            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
-            TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 2 if encoder_time_downscale[0] else 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 2 if encoder_time_downscale[1] else 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 2 if encoder_time_downscale[2] else 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
            conv(64, self.latent_channels),
        )
        n_f = [256, 128, 64, 64]
-        self.frames_to_trim = 2**sum(decoder_time_upscale) - 1
+
        self.decoder = nn.Sequential(
            Clamp(), conv(self.latent_channels, n_f[0]), act_func,
-            MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False),
-            MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False),
-            MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False),
+            MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 2 if decoder_time_upscale[0] else 1), conv(n_f[0], n_f[1], bias=False),
+            MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[1] else 1), conv(n_f[1], n_f[2], bias=False),
+            MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[2] else 1), conv(n_f[2], n_f[3], bias=False),
            act_func, conv(n_f[3], self.image_channels*self.patch_size**2),
        )
-        @property
-        def show_progress_bar(self):
-            return self._show_progress_bar

-        @show_progress_bar.setter
-        def show_progress_bar(self, value):
-            self._show_progress_bar = value
+        self.t_downscale = 2**sum(t.stride == 2 for t in self.encoder if isinstance(t, TPool))
+        self.t_upscale = 2**sum(t.stride == 2 for t in self.decoder if isinstance(t, TGrow))
+        self.frames_to_trim = self.t_upscale - 1
+        self._show_progress_bar = show_progress_bar
+
+    @property
+    def show_progress_bar(self):
+        return self._show_progress_bar
+
+    @show_progress_bar.setter
+    def show_progress_bar(self, value):
+        self._show_progress_bar = value

    def encode(self, x, **kwargs):
-        if self.patch_size > 1:
-            x = F.pixel_unshuffle(x, self.patch_size)
        x = x.movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
-        if x.shape[1] % 4 != 0:
-            # pad at end to multiple of 4
-            n_pad = 4 - x.shape[1] % 4
+        if self.patch_size > 1:
+            B, T, C, H, W = x.shape
+            x = x.reshape(B * T, C, H, W)
+            x = F.pixel_unshuffle(x, self.patch_size)
+            x = x.reshape(B, T, C * self.patch_size ** 2, H // self.patch_size, W // self.patch_size)
+        if x.shape[1] % self.t_downscale != 0:
+            # pad at end to multiple of t_downscale
+            n_pad = self.t_downscale - x.shape[1] % self.t_downscale
            padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
            x = torch.cat([x, padding], 1)
        x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
        return self.process_out(x)

    def decode(self, x, **kwargs):
+        x = x.unsqueeze(0) if x.ndim == 4 else x  # [T, C, H, W] -> [1, T, C, H, W]
+        x = x.movedim(1, 2) if x.shape[1] != self.latent_channels else x  # [B, T, C, H, W] or [B, C, T, H, W]
        x = self.process_in(x).movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
        x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
        if self.patch_size > 1:
@@ -0,0 +1,249 @@
+from .anima import Qwen3Tokenizer
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+import torch
+import math
+import comfy.utils
+
+
+def sample_manual_loop_no_classes(
+    model,
+    ids=None,
+    paddings=(),
+    execution_dtype=None,
+    cfg_scale: float = 2.0,
+    temperature: float = 0.85,
+    top_p: float = 0.9,
+    top_k: int = None,
+    seed: int = 1,
+    min_tokens: int = 1,
+    max_new_tokens: int = 2048,
+    audio_start_id: int = 151669,  # The cutoff ID for audio codes
+    audio_end_id: int = 215669,
+    eos_token_id: int = 151645,
+):
+    """Autoregressively sample audio-code tokens using classifier-free guidance.
+
+    The batch is expected to hold exactly two rows: row 0 is the conditional
+    prompt and row 1 the unconditional one (the loop slices ``[0:1]`` / ``[1:2]``
+    and feeds every sampled token back to both rows).
+
+    Args:
+        model: Text-encoder wrapper exposing ``execution_device``,
+            ``process_tokens`` and ``transformer`` (with ``logits`` and
+            ``model.config``).
+        ids: Token id rows passed to ``model.process_tokens``.
+        paddings: Per-row count of leading pad positions; those positions are
+            masked out of the attention mask.
+        execution_dtype: Compute dtype. When ``None``, bfloat16 is used if
+            ``should_use_bf16`` reports support for the device, else float32.
+        cfg_scale: Guidance scale applied to ``cond - uncond`` logits.
+        temperature: Sampling temperature; ``<= 0`` switches to greedy argmax.
+        top_p: Nucleus-sampling threshold; skipped when ``None`` or ``>= 1.0``.
+        top_k: Keep only the ``top_k`` best logits; skipped when ``None``/``0``.
+        seed: Seed for the sampling generator.
+        min_tokens: EOS is only allowed once ``step > min_tokens``; also sizes
+            the pre-allocated key/value cache.
+        max_new_tokens: Upper bound on generated tokens.
+        audio_start_id: First token id treated as an audio code.
+        audio_end_id: Token ids from here upwards are suppressed.
+        eos_token_id: Token id that terminates generation when sampled.
+
+    Returns:
+        List of ints: each sampled token id minus ``audio_start_id``.
+    """
+    device = model.execution_device
+
+    if execution_dtype is None:
+        if comfy.model_management.should_use_bf16(device):
+            execution_dtype = torch.bfloat16
+        else:
+            execution_dtype = torch.float32
+
+    embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
+    # Left padding: hide the first `t` positions of row `i`, attend to the rest.
+    for i, t in enumerate(paddings):
+        attention_mask[i, :t] = 0
+        attention_mask[i, t:] = 1
+
+    output_audio_codes = []
+    generator = torch.Generator(device=device)
+    generator.manual_seed(seed)
+    model_config = model.transformer.model.config
+
+    # One (key, value, filled_length) entry per layer, sized for the prompt plus
+    # `min_tokens` generated positions.
+    cache_shape = [embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim]
+    past_key_values = [
+        (
+            torch.empty(cache_shape, device=device, dtype=execution_dtype),
+            torch.empty(cache_shape, device=device, dtype=execution_dtype),
+            0,
+        )
+        for _ in range(model_config.num_hidden_layers)
+    ]
+
+    progress_bar = comfy.utils.ProgressBar(max_new_tokens)
+
+    for step in range(max_new_tokens):
+        outputs = model.transformer(None, attention_mask, embeds=embeds.to(execution_dtype), num_tokens=num_tokens, intermediate_output=None, dtype=execution_dtype, embeds_info=embeds_info, past_key_values=past_key_values)
+        next_token_logits = model.transformer.logits(outputs[0])[:, -1]
+        past_key_values = outputs[2]
+
+        # Classifier-free guidance between the conditional and unconditional rows.
+        cond_logits = next_token_logits[0:1]
+        uncond_logits = next_token_logits[1:2]
+        cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
+
+        # EOS lives below the audio range, so its score has to be saved before
+        # the non-audio logits are wiped and restored afterwards.
+        allow_eos = eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step
+        if allow_eos:
+            eos_score = cfg_logits[:, eos_token_id].clone()
+
+        remove_logit_value = torch.finfo(cfg_logits.dtype).min
+        # Only generate audio tokens
+        cfg_logits[:, :audio_start_id] = remove_logit_value
+        cfg_logits[:, audio_end_id:] = remove_logit_value
+
+        if allow_eos:
+            cfg_logits[:, eos_token_id] = eos_score
+
+        if top_k is not None and top_k > 0:
+            top_k_vals, _ = torch.topk(cfg_logits, top_k)
+            min_val = top_k_vals[..., -1, None]
+            cfg_logits[cfg_logits < min_val] = remove_logit_value
+
+        if top_p is not None and top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True)
+            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+            sorted_indices_to_remove = cumulative_probs > top_p
+            # Shift right by one so the first token crossing the threshold is kept.
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+            cfg_logits[indices_to_remove] = remove_logit_value
+
+        if temperature > 0:
+            cfg_logits = cfg_logits / temperature
+            next_token = torch.multinomial(torch.softmax(cfg_logits, dim=-1), num_samples=1, generator=generator).squeeze(1)
+        else:
+            next_token = torch.argmax(cfg_logits, dim=-1)
+
+        token = next_token.item()
+
+        if token == eos_token_id:
+            break
+
+        # Feed the sampled token back to both rows; the cache holds everything earlier.
+        embed, _, _, _ = model.process_tokens([[token]], device)
+        embeds = embed.repeat(2, 1, 1)
+        attention_mask = torch.cat([attention_mask, torch.ones((2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
+
+        output_audio_codes.append(token - audio_start_id)
+        # `step` is zero-based: step + 1 tokens are done, so the bar can reach 100%.
+        progress_bar.update_absolute(step + 1)
+
+    return output_audio_codes
+
+
+def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0):
+    """Left-pad the shorter prompt to match the other, then sample audio codes.
+
+    ``positive`` and ``negative`` are lists of chunks of ``(token, weight)``
+    pairs; only the token ids of the first chunk of each are used. Sampling is
+    delegated to ``sample_manual_loop_no_classes`` with a fixed CFG scale of 2.0.
+    """
+    pos_chunks = [[tok for tok, _ in chunk] for chunk in positive]
+    neg_chunks = [[tok for tok, _ in chunk] for chunk in negative]
+    pos_ids, neg_ids = pos_chunks[0], neg_chunks[0]
+
+    # Only one of the two paddings can be non-zero.
+    length_gap = len(pos_ids) - len(neg_ids)
+    neg_pad = length_gap if length_gap > 0 else 0
+    pos_pad = -length_gap if length_gap < 0 else 0
+
+    if neg_pad:
+        neg_ids = [model.special_tokens["pad"]] * neg_pad + neg_ids
+    if pos_pad:
+        pos_ids = [model.special_tokens["pad"]] * pos_pad + pos_ids
+
+    return sample_manual_loop_no_classes(
+        model,
+        [pos_ids, neg_ids],
+        [pos_pad, neg_pad],
+        cfg_scale=2.0,
+        seed=seed,
+        min_tokens=min_tokens,
+        max_new_tokens=max_tokens,
+    )
+
+
+class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
+    """Tokenizer that builds the caption, lyrics and LM prompts from one text input."""
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(
+            embedding_directory=embedding_directory,
+            tokenizer_data=tokenizer_data,
+            name="qwen3_06b",
+            tokenizer=Qwen3Tokenizer,
+        )
+
+    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
+        """Return a dict of tokenized prompts plus LM sampling metadata.
+
+        Optional keyword arguments read here: ``lyrics``, ``bpm``, ``duration``,
+        ``keyscale``, ``timesignature``, ``language`` and ``seed``. All keyword
+        arguments are also forwarded to the "lyrics" and "qwen3_06b"
+        tokenizer calls.
+        """
+        lyrics = kwargs.get("lyrics", "")
+        language = kwargs.get("language", "en")
+        bpm = kwargs.get("bpm", 120)
+        keyscale = kwargs.get("keyscale", "C major")
+        timesignature = kwargs.get("timesignature", 2)
+        seed = kwargs.get("seed", 0)
+        # Durations are rounded up to a whole number before use.
+        duration = math.ceil(kwargs.get("duration", 120))
+
+        lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n<think>\n{}\n</think>\n\n<|im_end|>\n"
+        meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
+        meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
+
+        lm_positive_text = lm_template.format(text, lyrics, meta_lm)
+        # The negative LM prompt is identical except for an empty metadata block.
+        lm_negative_text = lm_template.format(text, lyrics, "")
+        lyrics_text = "# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics)
+        caption_text = "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap)
+
+        tokenizer = self.qwen3_06b
+        return {
+            "lm_prompt": tokenizer.tokenize_with_weights(lm_positive_text, disable_weights=True),
+            "lm_prompt_negative": tokenizer.tokenize_with_weights(lm_negative_text, disable_weights=True),
+            "lyrics": tokenizer.tokenize_with_weights(lyrics_text, return_word_ids, disable_weights=True, **kwargs),
+            "qwen3_06b": tokenizer.tokenize_with_weights(caption_text, return_word_ids, **kwargs),
+            "lm_metadata": {"min_tokens": duration * 5, "seed": seed},
+        }
+
+
+class Qwen3_06BModel(sd1_clip.SDClipModel):
+    """SDClipModel wrapper around ``comfy.text_encoders.llama.Qwen3_06B_ACE15``."""
+
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        # `attention_mask` toggles both using and returning attention masks.
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"pad": 151643},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Qwen3_06B_ACE15,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+class Qwen3_2B_ACE15(sd1_clip.SDClipModel):
+    """SDClipModel wrapper around ``comfy.text_encoders.llama.Qwen3_2B_ACE15_lm``."""
+
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        # Re-key the LM-specific quantization metadata under the generic key,
+        # working on a copy so the caller's dict is left untouched.
+        quant_meta = model_options.get("llama_quantization_metadata", None)
+        if quant_meta is not None:
+            model_options = {**model_options, "quantization_metadata": quant_meta}
+
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"pad": 151643},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Qwen3_2B_ACE15_lm,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+class Qwen3_4B_ACE15(sd1_clip.SDClipModel):
+    """SDClipModel wrapper around ``comfy.text_encoders.llama.Qwen3_4B_ACE15_lm``."""
+
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        # Re-key the LM-specific quantization metadata under the generic key,
+        # working on a copy so the caller's dict is left untouched.
+        quant_meta = model_options.get("llama_quantization_metadata", None)
+        if quant_meta is not None:
+            model_options = {**model_options, "quantization_metadata": quant_meta}
+
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"pad": 151643},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Qwen3_4B_ACE15_lm,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+class ACE15TEModel(torch.nn.Module):
+    """Text-encoder bundle: a Qwen3 0.6B encoder plus an optional audio-code LM.
+
+    ``lm_model`` selects the LM ("qwen3_2b" or "qwen3_4b"); it is stored as an
+    attribute named after that string. Any other value, including the default
+    ``None``, attaches no LM.
+    """
+
+    def __init__(self, device="cpu", dtype=None, dtype_llama=None, lm_model=None, model_options={}):
+        super().__init__()
+        if dtype_llama is None:
+            dtype_llama = dtype
+
+        # Per-token factor used by memory_estimation_function.
+        model = None
+        self.constant = 0.4375
+        if lm_model == "qwen3_4b":
+            model = Qwen3_4B_ACE15
+            self.constant = 0.5625
+        elif lm_model == "qwen3_2b":
+            model = Qwen3_2B_ACE15
+
+        self.lm_model = lm_model
+        self.qwen3_06b = Qwen3_06BModel(device=device, dtype=dtype, model_options=model_options)
+        if model is not None:
+            setattr(self, self.lm_model, model(device=device, dtype=dtype_llama, model_options=model_options))
+
+        self.dtypes = {dtype, dtype_llama}
+
+    def _get_lm_model(self):
+        """Return the attached LM module, or ``None`` when none is configured."""
+        # getattr() raises TypeError for a non-string name, and `lm_model`
+        # defaults to None, so guard before the lookup.
+        if not isinstance(self.lm_model, str):
+            return None
+        return getattr(self, self.lm_model, None)
+
+    def encode_token_weights(self, token_weight_pairs):
+        """Encode caption and lyrics, and sample audio codes from the LM prompts.
+
+        Returns ``(base_out, None, extras)`` where ``extras`` holds
+        "conditioning_lyrics" and "audio_codes".
+        """
+        token_weight_pairs_base = token_weight_pairs["qwen3_06b"]
+        token_weight_pairs_lyrics = token_weight_pairs["lyrics"]
+
+        self.qwen3_06b.set_clip_options({"layer": None})
+        base_out, _, _ = self.qwen3_06b.encode_token_weights(token_weight_pairs_base)
+        self.qwen3_06b.set_clip_options({"layer": [0]})
+        lyrics_embeds, _, _ = self.qwen3_06b.encode_token_weights(token_weight_pairs_lyrics)
+
+        lm_metadata = token_weight_pairs["lm_metadata"]
+        # Fall back to the 0.6B encoder when no LM is attached.
+        lm_model = self._get_lm_model()
+        if lm_model is None:
+            lm_model = self.qwen3_06b
+        # min_tokens doubles as max_tokens, so the code count is fixed.
+        audio_codes = generate_audio_codes(lm_model, token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"])
+
+        return base_out, None, {"conditioning_lyrics": lyrics_embeds[:, 0], "audio_codes": [audio_codes]}
+
+    def set_clip_options(self, options):
+        """Apply ``options`` to the 0.6B encoder and, if present, the LM."""
+        self.qwen3_06b.set_clip_options(options)
+        lm_model = self._get_lm_model()
+        if lm_model is not None:
+            lm_model.set_clip_options(options)
+
+    def reset_clip_options(self):
+        """Reset clip options on the 0.6B encoder and, if present, the LM."""
+        self.qwen3_06b.reset_clip_options()
+        lm_model = self._get_lm_model()
+        if lm_model is not None:
+            lm_model.reset_clip_options()
+
+    def load_sd(self, sd):
+        """Route a state dict to the 0.6B encoder or the LM by layernorm width.
+
+        Returns ``None`` when the probe key is absent from ``sd``.
+
+        Raises:
+            AttributeError: ``sd`` is not a 0.6B checkpoint and no LM is attached.
+        """
+        probe_key = "model.layers.0.post_attention_layernorm.weight"
+        if probe_key not in sd:
+            return None
+
+        # A width of 1024 identifies the 0.6B encoder; anything else goes to the LM.
+        if sd[probe_key].shape[0] == 1024:
+            return self.qwen3_06b.load_sd(sd)
+
+        lm_model = self._get_lm_model()
+        if lm_model is None:
+            raise AttributeError("ACE15TEModel has no LM attached for lm_model={!r}".format(self.lm_model))
+        return lm_model.load_sd(sd)
+
+    def memory_estimation_function(self, token_weight_pairs, device=None):
+        """Estimate memory from LM prompt length plus the tokens to generate.
+
+        The result is ``num_tokens * constant * 1024 * 1024`` (presumably bytes;
+        confirm against the caller), with ``constant`` halved when bf16 is used.
+        """
+        lm_metadata = token_weight_pairs["lm_metadata"]
+        constant = self.constant
+        if comfy.model_management.should_use_bf16(device):
+            constant *= 0.5
+
+        lm_prompt = token_weight_pairs.get("lm_prompt", [])
+        num_tokens = sum(len(chunk) for chunk in lm_prompt)
+        num_tokens += lm_metadata['min_tokens']
+        return num_tokens * constant * 1024 * 1024
+
+def te(dtype_llama=None, llama_quantization_metadata=None, lm_model="qwen3_2b"):
+    """Build an ``ACE15TEModel`` subclass with the given LM settings baked in."""
+    class ACE15TEModel_(ACE15TEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            # Inject the quantization metadata into a copy of the options.
+            if llama_quantization_metadata is not None:
+                model_options = {**model_options, "llama_quantization_metadata": llama_quantization_metadata}
+            super().__init__(
+                device=device,
+                dtype_llama=dtype_llama,
+                lm_model=lm_model,
+                dtype=dtype,
+                model_options=model_options,
+            )
+
+    return ACE15TEModel_
@@ -0,0 +1,61 @@
+from transformers import Qwen2Tokenizer, T5TokenizerFast
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+import os
+import torch
+
+
+class Qwen3Tokenizer(sd1_clip.SDTokenizer):
+    """SDTokenizer configured for the bundled "qwen25_tokenizer" files (embedding key 'qwen3_06b')."""
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        # The tokenizer files sit in a folder next to this module.
+        module_dir = os.path.dirname(os.path.realpath(__file__))
+        super().__init__(
+            os.path.join(module_dir, "qwen25_tokenizer"),
+            pad_with_end=False,
+            embedding_directory=embedding_directory,
+            embedding_size=1024,
+            embedding_key='qwen3_06b',
+            tokenizer_class=Qwen2Tokenizer,
+            has_start_token=False,
+            has_end_token=False,
+            pad_to_max_length=False,
+            max_length=99999999,
+            min_length=1,
+            pad_token=151643,
+            tokenizer_data=tokenizer_data,
+        )
+
+class T5XXLTokenizer(sd1_clip.SDTokenizer):
+    """SDTokenizer configured for the bundled "t5_tokenizer" files (embedding key 't5xxl')."""
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        # The tokenizer files sit in a folder next to this module.
+        module_dir = os.path.dirname(os.path.realpath(__file__))
+        super().__init__(
+            os.path.join(module_dir, "t5_tokenizer"),
+            embedding_directory=embedding_directory,
+            pad_with_end=False,
+            embedding_size=4096,
+            embedding_key='t5xxl',
+            tokenizer_class=T5TokenizerFast,
+            has_start_token=False,
+            pad_to_max_length=False,
+            max_length=99999999,
+            min_length=1,
+            tokenizer_data=tokenizer_data,
+        )
+
+class AnimaTokenizer:
+    """Tokenizes one prompt with both the Qwen3 and the T5-XXL tokenizer."""
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        shared_args = dict(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.qwen3_06b = Qwen3Tokenizer(**shared_args)
+        self.t5xxl = T5XXLTokenizer(**shared_args)
+
+    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+        """Return both tokenizations; Qwen weights are forced to 1.0, T5 weights are kept."""
+        qwen_chunks = self.qwen3_06b.tokenize_with_weights(text, return_word_ids, **kwargs)
+        return {
+            # Set weights to 1.0
+            "qwen3_06b": [[(token, 1.0) for token, _ in chunk] for chunk in qwen_chunks],
+            "t5xxl": self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs),
+        }
+
+    def untokenize(self, token_weight_pair):
+        """Delegate untokenization to the T5-XXL tokenizer."""
+        return self.t5xxl.untokenize(token_weight_pair)
+
+    def state_dict(self):
+        """Return an empty dict."""
+        return {}
+
+
+class Qwen3_06BModel(sd1_clip.SDClipModel):
+    """SDClipModel wrapper around ``comfy.text_encoders.llama.Qwen3_06B``."""
+
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        # `attention_mask` toggles both using and returning attention masks.
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"pad": 151643},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Qwen3_06B,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+
+class AnimaTEModel(sd1_clip.SD1ClipModel):
+    """SD1ClipModel using ``Qwen3_06BModel`` that also passes T5-XXL ids and weights through."""
+
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(
+            device=device,
+            dtype=dtype,
+            name="qwen3_06b",
+            clip_model=Qwen3_06BModel,
+            model_options=model_options,
+        )
+
+    def encode_token_weights(self, token_weight_pairs):
+        """Encode via the parent class, then attach the first T5-XXL chunk as tensors.
+
+        The third element of the parent's result gains "t5xxl_ids" (int tensor)
+        and "t5xxl_weights" built from the ``(token, weight)`` pairs.
+        """
+        out = super().encode_token_weights(token_weight_pairs)
+        t5_pairs = token_weight_pairs["t5xxl"][0]
+        extras = out[2]
+        extras["t5xxl_ids"] = torch.tensor([pair[0] for pair in t5_pairs], dtype=torch.int)
+        extras["t5xxl_weights"] = torch.tensor([pair[1] for pair in t5_pairs])
+        return out
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    """Build an ``AnimaTEModel`` subclass with dtype and quantization overrides baked in."""
+    class AnimaTEModel_(AnimaTEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            # A configured LM dtype takes precedence over the caller's dtype.
+            chosen_dtype = dtype if dtype_llama is None else dtype_llama
+            # Inject the quantization metadata into a copy of the options.
+            if llama_quantization_metadata is not None:
+                model_options = {**model_options, "quantization_metadata": llama_quantization_metadata}
+            super().__init__(device=device, dtype=chosen_dtype, model_options=model_options)
+
+    return AnimaTEModel_
@@ -118,7 +118,7 @@ class MistralTokenizerClass:
 class Mistral3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        self.tekken_data = tokenizer_data.get("tekken_model", None)
-        super().__init__("", pad_with_end=False, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
+        super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)

    def state_dict(self):
        return {"tekken_model": self.tekken_data}
@@ -176,12 +176,12 @@ def flux2_te(dtype_llama=None, llama_quantization_metadata=None, pruned=False):
 class Qwen3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=151643, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=151643, tokenizer_data=tokenizer_data)

 class Qwen3Tokenizer8B(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=4096, embedding_key='qwen3_8b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=151643, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=4096, embedding_key='qwen3_8b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=151643, tokenizer_data=tokenizer_data)

 class KleinTokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}, name="qwen3_4b"):
@@ -10,9 +10,11 @@ import comfy.utils

 def llama_detect(state_dict, prefix=""):
    out = {}
-    t5_key = "{}model.norm.weight".format(prefix)
-    if t5_key in state_dict:
-        out["dtype_llama"] = state_dict[t5_key].dtype
+    norm_keys = ["{}model.norm.weight".format(prefix), "{}model.layers.0.input_layernorm.weight".format(prefix)]
+    for norm_key in norm_keys:
+        if norm_key in state_dict:
+            out["dtype_llama"] = state_dict[norm_key].dtype
+            break

    quant = comfy.utils.detect_layer_quantization(state_dict, prefix)
    if quant is not None:
@@ -1,11 +1,12 @@
 import torch
 import torch.nn as nn
 from dataclasses import dataclass
-from typing import Optional, Any
+from typing import Optional, Any, Tuple
 import math

 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.model_management
+import comfy.ops
 import comfy.ldm.common_dit
 import comfy.clip_model

@@ -32,6 +33,7 @@ class Llama2Config:
    k_norm = None
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Mistral3Small24BConfig:
@@ -54,6 +56,7 @@ class Mistral3Small24BConfig:
    k_norm = None
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Qwen25_3BConfig:
@@ -76,6 +79,99 @@ class Qwen25_3BConfig:
    k_norm = None
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False
+
+@dataclass
+class Qwen3_06BConfig:
+    # Hyperparameters for the Qwen3 0.6B text model.
+    # Only the annotated names are dataclass fields (constructor arguments);
+    # the unannotated ones are plain class attributes shared by all instances.
+    vocab_size: int = 151936
+    hidden_size: int = 1024
+    intermediate_size: int = 3072
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 32768
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+    lm_head: bool = False
+
+@dataclass
+class Qwen3_06B_ACE15_Config:
+    # Hyperparameters for the Qwen3 0.6B variant used by ACE15.
+    # Values match Qwen3_06BConfig except vocab_size (151669 instead of 151936).
+    # Only the annotated names are dataclass fields (constructor arguments);
+    # the unannotated ones are plain class attributes shared by all instances.
+    vocab_size: int = 151669
+    hidden_size: int = 1024
+    intermediate_size: int = 3072
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 32768
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+    lm_head: bool = False
+
+@dataclass
+class Qwen3_2B_ACE15_lm_Config:
+    # Hyperparameters for the 2B ACE15 language model (hidden size 2048, 28 layers).
+    # Only the annotated names are dataclass fields (constructor arguments);
+    # the unannotated ones are plain class attributes shared by all instances.
+    vocab_size: int = 217204
+    hidden_size: int = 2048
+    intermediate_size: int = 6144
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 40960
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+    lm_head: bool = False
+
+@dataclass
+class Qwen3_4B_ACE15_lm_Config:
+    # Hyperparameters for the 4B ACE15 language model (hidden size 2560, 36 layers).
+    # Only the annotated names are dataclass fields (constructor arguments);
+    # the unannotated ones are plain class attributes shared by all instances.
+    vocab_size: int = 217204
+    hidden_size: int = 2560
+    intermediate_size: int = 9728
+    num_hidden_layers: int = 36
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 40960
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Qwen3_4BConfig:
@@ -98,6 +194,7 @@ class Qwen3_4BConfig:
    k_norm = "gemma3"
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Qwen3_8BConfig:
@@ -120,6 +217,7 @@ class Qwen3_8BConfig:
    k_norm = "gemma3"
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Ovis25_2BConfig:
@@ -142,6 +240,7 @@ class Ovis25_2BConfig:
    k_norm = "gemma3"
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Qwen25_7BVLI_Config:
@@ -164,6 +263,7 @@ class Qwen25_7BVLI_Config:
    k_norm = None
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Gemma2_2B_Config:
@@ -187,6 +287,7 @@ class Gemma2_2B_Config:
    sliding_attention = None
    rope_scale = None
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Gemma3_4B_Config:
@@ -210,6 +311,7 @@ class Gemma3_4B_Config:
    sliding_attention = [1024, 1024, 1024, 1024, 1024, False]
    rope_scale = [8.0, 1.0]
    final_norm: bool = True
+    lm_head: bool = False

@dataclass
 class Gemma3_12B_Config:
@@ -233,6 +335,7 @@ class Gemma3_12B_Config:
    sliding_attention = [1024, 1024, 1024, 1024, 1024, False]
    rope_scale = [8.0, 1.0]
    final_norm: bool = True
+    lm_head: bool = False
    vision_config = {"num_channels": 3, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "image_size": 896, "intermediate_size": 4304, "model_type": "siglip_vision_model", "num_attention_heads": 16, "num_hidden_layers": 27, "patch_size": 14}
    mm_tokens_per_image = 256

@@ -334,6 +437,7 @@ class Attention(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
        freqs_cis: Optional[torch.Tensor] = None,
        optimized_attention=None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        batch_size, seq_length, _ = hidden_states.shape
        xq = self.q_proj(hidden_states)
@@ -351,11 +455,30 @@ class Attention(nn.Module):

        xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis)

+        present_key_value = None
+        if past_key_value is not None:
+            index = 0
+            num_tokens = xk.shape[2]
+            if len(past_key_value) > 0:
+                past_key, past_value, index = past_key_value
+                if past_key.shape[2] >= (index + num_tokens):
+                    past_key[:, :, index:index + xk.shape[2]] = xk
+                    past_value[:, :, index:index + xv.shape[2]] = xv
+                    xk = past_key[:, :, :index + xk.shape[2]]
+                    xv = past_value[:, :, :index + xv.shape[2]]
+                    present_key_value = (past_key, past_value, index + num_tokens)
+                else:
+                    xk = torch.cat((past_key[:, :, :index], xk), dim=2)
+                    xv = torch.cat((past_value[:, :, :index], xv), dim=2)
+                    present_key_value = (xk, xv, index + num_tokens)
+            else:
+                present_key_value = (xk, xv, index + num_tokens)
+
        xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
        xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)

        output = optimized_attention(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True)
-        return self.o_proj(output)
+        return self.o_proj(output), present_key_value

 class MLP(nn.Module):
    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
@@ -386,15 +509,17 @@ class TransformerBlock(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
        freqs_cis: Optional[torch.Tensor] = None,
        optimized_attention=None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        # Self Attention
        residual = x
        x = self.input_layernorm(x)
-        x = self.self_attn(
+        x, present_key_value = self.self_attn(
            hidden_states=x,
            attention_mask=attention_mask,
            freqs_cis=freqs_cis,
            optimized_attention=optimized_attention,
+            past_key_value=past_key_value,
        )
        x = residual + x

@@ -404,7 +529,7 @@ class TransformerBlock(nn.Module):
        x = self.mlp(x)
        x = residual + x

-        return x
+        return x, present_key_value

 class TransformerBlockGemma2(nn.Module):
    def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None):
@@ -429,6 +554,7 @@ class TransformerBlockGemma2(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
        freqs_cis: Optional[torch.Tensor] = None,
        optimized_attention=None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        if self.transformer_type == 'gemma3':
            if self.sliding_attention:
@@ -446,11 +572,12 @@ class TransformerBlockGemma2(nn.Module):
        # Self Attention
        residual = x
        x = self.input_layernorm(x)
-        x = self.self_attn(
+        x, present_key_value = self.self_attn(
            hidden_states=x,
            attention_mask=attention_mask,
            freqs_cis=freqs_cis,
            optimized_attention=optimized_attention,
+            past_key_value=past_key_value,
        )

        x = self.post_attention_layernorm(x)
@@ -463,7 +590,7 @@ class TransformerBlockGemma2(nn.Module):
        x = self.post_feedforward_layernorm(x)
        x = residual + x

-        return x
+        return x, present_key_value

 class Llama2_(nn.Module):
    def __init__(self, config, device=None, dtype=None, ops=None):
@@ -494,9 +621,10 @@ class Llama2_(nn.Module):
        else:
            self.norm = None

-        # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
+        if config.lm_head:
+            self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)

-    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]):
+    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None):
        if embeds is not None:
            x = embeds
        else:
@@ -505,8 +633,13 @@ class Llama2_(nn.Module):
        if self.normalize_in:
            x *= self.config.hidden_size ** 0.5

+        seq_len = x.shape[1]
+        past_len = 0
+        if past_key_values is not None and len(past_key_values) > 0:
+            past_len = past_key_values[0][2]
+
        if position_ids is None:
-            position_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0)
+            position_ids = torch.arange(past_len, past_len + seq_len, device=x.device).unsqueeze(0)

        freqs_cis = precompute_freqs_cis(self.config.head_dim,
                                         position_ids,
@@ -517,14 +650,16 @@ class Llama2_(nn.Module):

        mask = None
        if attention_mask is not None:
-            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
+            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1])
+            mask = mask.masked_fill(mask.to(torch.bool), torch.finfo(x.dtype).min)
+
+        if seq_len > 1:
+            causal_mask = torch.empty(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device).fill_(torch.finfo(x.dtype).min).triu_(1)
+            if mask is not None:
+                mask += causal_mask
+            else:
+                mask = causal_mask

-        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
-        if mask is not None:
-            mask += causal_mask
-        else:
-            mask = causal_mask
        optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

        intermediate = None
@@ -540,16 +675,27 @@ class Llama2_(nn.Module):
            elif intermediate_output < 0:
                intermediate_output = len(self.layers) + intermediate_output

+        next_key_values = []
        for i, layer in enumerate(self.layers):
            if all_intermediate is not None:
                if only_layers is None or (i in only_layers):
                    all_intermediate.append(x.unsqueeze(1).clone())
-            x = layer(
+
+            past_kv = None
+            if past_key_values is not None:
+                past_kv = past_key_values[i] if len(past_key_values) > 0 else []
+
+            x, current_kv = layer(
                x=x,
                attention_mask=mask,
                freqs_cis=freqs_cis,
                optimized_attention=optimized_attention,
+                past_key_value=past_kv,
            )
+
+            if current_kv is not None:
+                next_key_values.append(current_kv)
+
            if i == intermediate_output:
                intermediate = x.clone()

@@ -566,7 +712,10 @@ class Llama2_(nn.Module):
        if intermediate is not None and final_layer_norm_intermediate and self.norm is not None:
            intermediate = self.norm(intermediate)

-        return x, intermediate
+        if len(next_key_values) > 0:
+            return x, intermediate, next_key_values
+        else:
+            return x, intermediate


 class Gemma3MultiModalProjector(torch.nn.Module):
@@ -613,6 +762,21 @@ class BaseLlama:
    def forward(self, input_ids, *args, **kwargs):
        return self.model(input_ids, *args, **kwargs)

+class BaseQwen3:
+    """Mixin providing `logits`, which applies the token-embedding weight as a linear projection."""
+    def logits(self, x):
+        """
+        Project the last sequence position of `x` with the embedding weight.
+
+        Only `x[:, -1:]` is used. The weight of `self.model.embed_tokens` is applied
+        through `torch.nn.functional.linear` without a bias.
+
+        Args:
+            x: presumably hidden states shaped (batch, seq, hidden) -- TODO confirm against callers.
+
+        Returns:
+            The projection of the final position only.
+        """
+        embed = self.model.embed_tokens
+        last = x[:, -1:]
+
+        if embed.comfy_cast_weights:
+            # cast_bias_weight hands back the weight to use plus an offload stream handle.
+            weight, _, offload_stream = comfy.ops.cast_bias_weight(embed, last, offloadable=True)
+        else:
+            weight, offload_stream = embed.weight.to(x), None
+
+        out = torch.nn.functional.linear(last, weight, None)
+
+        # Called on both paths, with the same module, weight and stream handle.
+        comfy.ops.uncast_bias_weight(embed, weight, None, offload_stream)
+        return out

 class Llama2(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
@@ -641,7 +805,34 @@ class Qwen25_3B(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

-class Qwen3_4B(BaseLlama, torch.nn.Module):
+class Qwen3_06B(BaseLlama, BaseQwen3, torch.nn.Module):
+    """Wraps a `Llama2_` backbone configured by `Qwen3_06BConfig`."""
+    def __init__(self, config_dict, dtype, device, operations):
+        """
+        Args:
+            config_dict: Keyword arguments forwarded to `Qwen3_06BConfig`.
+            dtype: dtype handed to the backbone and stored on `self.dtype`.
+            device: Device handed to the backbone.
+            operations: Passed to the backbone as its `ops` argument.
+        """
+        super().__init__()
+        cfg = Qwen3_06BConfig(**config_dict)
+        self.dtype = dtype
+        self.num_layers = cfg.num_hidden_layers
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+class Qwen3_06B_ACE15(BaseLlama, BaseQwen3, torch.nn.Module):
+    """Wraps a `Llama2_` backbone configured by `Qwen3_06B_ACE15_Config`."""
+    def __init__(self, config_dict, dtype, device, operations):
+        """
+        Args:
+            config_dict: Keyword arguments forwarded to `Qwen3_06B_ACE15_Config`.
+            dtype: dtype handed to the backbone and stored on `self.dtype`.
+            device: Device handed to the backbone.
+            operations: Passed to the backbone as its `ops` argument.
+        """
+        super().__init__()
+        cfg = Qwen3_06B_ACE15_Config(**config_dict)
+        self.dtype = dtype
+        self.num_layers = cfg.num_hidden_layers
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+class Qwen3_2B_ACE15_lm(BaseLlama, BaseQwen3, torch.nn.Module):
+    """Wraps a `Llama2_` backbone configured by `Qwen3_2B_ACE15_lm_Config`."""
+    def __init__(self, config_dict, dtype, device, operations):
+        """
+        Args:
+            config_dict: Keyword arguments forwarded to `Qwen3_2B_ACE15_lm_Config`.
+            dtype: dtype handed to the backbone and stored on `self.dtype`.
+            device: Device handed to the backbone.
+            operations: Passed to the backbone as its `ops` argument.
+        """
+        super().__init__()
+        cfg = Qwen3_2B_ACE15_lm_Config(**config_dict)
+        self.dtype = dtype
+        self.num_layers = cfg.num_hidden_layers
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+class Qwen3_4B(BaseLlama, BaseQwen3, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        config = Qwen3_4BConfig(**config_dict)
@@ -650,7 +841,16 @@ class Qwen3_4B(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

-class Qwen3_8B(BaseLlama, torch.nn.Module):
+class Qwen3_4B_ACE15_lm(BaseLlama, BaseQwen3, torch.nn.Module):
+    """Wraps a `Llama2_` backbone configured by `Qwen3_4B_ACE15_lm_Config`."""
+    def __init__(self, config_dict, dtype, device, operations):
+        """
+        Args:
+            config_dict: Keyword arguments forwarded to `Qwen3_4B_ACE15_lm_Config`.
+            dtype: dtype handed to the backbone and stored on `self.dtype`.
+            device: Device handed to the backbone.
+            operations: Passed to the backbone as its `ops` argument.
+        """
+        super().__init__()
+        cfg = Qwen3_4B_ACE15_lm_Config(**config_dict)
+        self.dtype = dtype
+        self.num_layers = cfg.num_hidden_layers
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+class Qwen3_8B(BaseLlama, BaseQwen3, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        config = Qwen3_8BConfig(**config_dict)
@@ -118,9 +118,18 @@ class LTXAVTEModel(torch.nn.Module):
            sdo = comfy.utils.state_dict_prefix_replace(sd, {"text_embedding_projection.aggregate_embed.weight": "text_embedding_projection.weight", "model.diffusion_model.video_embeddings_connector.": "video_embeddings_connector.", "model.diffusion_model.audio_embeddings_connector.": "audio_embeddings_connector."}, filter_keys=True)
            if len(sdo) == 0:
                sdo = sd
-            missing, unexpected = self.load_state_dict(sdo, strict=False)
-            missing = [k for k in missing if not k.startswith("gemma3_12b.")] # filter out keys that belong to the main gemma model
-            return (missing, unexpected)
+
+            missing_all = []
+            unexpected_all = []
+
+            for prefix, component in [("text_embedding_projection.", self.text_embedding_projection), ("video_embeddings_connector.", self.video_embeddings_connector), ("audio_embeddings_connector.", self.audio_embeddings_connector)]:
+                component_sd = {k.replace(prefix, ""): v for k, v in sdo.items() if k.startswith(prefix)}
+                if component_sd:
+                    missing, unexpected = component.load_state_dict(component_sd, strict=False, assign=getattr(self, "can_assign_sd", False))
+                    missing_all.extend([f"{prefix}{k}" for k in missing])
+                    unexpected_all.extend([f"{prefix}{k}" for k in unexpected])
+
+            return (missing_all, unexpected_all)

    def memory_estimation_function(self, token_weight_pairs, device=None):
        constant = 6.0
@@ -6,7 +6,7 @@ import os
 class Qwen3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)


 class ZImageTokenizer(sd1_clip.SD1Tokenizer):
@@ -28,9 +28,11 @@ import logging
 import itertools
 from torch.nn.functional import interpolate
 from einops import rearrange
-from comfy.cli_args import args
+from comfy.cli_args import args, enables_dynamic_vram
 import json
 import time
+import mmap
+import warnings

 MMAP_TORCH_FILES = args.mmap_torch_files
 DISABLE_MMAP = args.disable_mmap
@@ -56,21 +58,70 @@ if hasattr(torch.serialization, "add_safe_globals"):  # TODO: this was added in
 else:
    logging.warning("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended as older versions of pytorch are no longer supported.")

+# Maps the dtype strings found in a safetensors header to torch dtypes
+# (looked up by load_safetensors via info["dtype"]).
+# Current as of safetensors 0.7.0
+_TYPES = {
+    "F64": torch.float64,
+    "F32": torch.float32,
+    "F16": torch.float16,
+    "BF16": torch.bfloat16,
+    "I64": torch.int64,
+    "I32": torch.int32,
+    "I16": torch.int16,
+    "I8": torch.int8,
+    "U8": torch.uint8,
+    "BOOL": torch.bool,
+    "F8_E4M3": torch.float8_e4m3fn,
+    "F8_E5M2": torch.float8_e5m2,
+    "C64": torch.complex64,
+
+    # NOTE(review): the wider unsigned dtypes are evaluated at import time and are
+    # presumably missing on older torch releases -- confirm the minimum supported version.
+    "U64": torch.uint64,
+    "U32": torch.uint32,
+    "U16": torch.uint16,
+}
+
+def load_safetensors(ckpt):
+    """
+    Load a safetensors file as tensor views over a read-only memory map.
+
+    Layout as parsed here: an 8-byte little-endian unsigned header length, a
+    UTF-8 JSON header of that length, then the raw tensor bytes. Each header
+    entry supplies "dtype", "shape" and "data_offsets"; the offsets are byte
+    positions relative to the start of the data area.
+
+    Args:
+        ckpt: Path of the file to map.
+
+    Returns:
+        Tuple `(sd, metadata)`. `sd` maps tensor names to tensors that alias the
+        mapping rather than copying it; `metadata` is the header's
+        "__metadata__" entry, or `{}` when the header has none.
+
+    Raises:
+        KeyError: If a tensor declares a dtype string that `_TYPES` does not list.
+    """
+    # mmap keeps its own duplicate of the descriptor, so the file object is
+    # closed here instead of being left open for the garbage collector.
+    with open(ckpt, "rb") as f:
+        mapping = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+
+    header_size = struct.unpack("<Q", mapping[:8])[0]
+    header = json.loads(mapping[8:8+header_size].decode("utf-8"))
+
+    with warnings.catch_warnings():
+        #We are working with read-only RAM by design
+        warnings.filterwarnings("ignore", message="The given buffer is not writable")
+        data_area = torch.frombuffer(mapping, dtype=torch.uint8)[8 + header_size:]
+
+    sd = {}
+    for name, info in header.items():
+        if name == "__metadata__":
+            continue
+
+        start, end = info["data_offsets"]
+        # Reinterpret the byte slice as the declared dtype, then as the declared shape.
+        sd[name] = data_area[start:end].view(_TYPES[info["dtype"]]).view(info["shape"])
+
+    return sd, header.get("__metadata__", {})
+
+
 def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
    if device is None:
        device = torch.device("cpu")
    metadata = None
    if ckpt.lower().endswith(".safetensors") or ckpt.lower().endswith(".sft"):
        try:
-            with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
-                sd = {}
-                for k in f.keys():
-                    tensor = f.get_tensor(k)
-                    if DISABLE_MMAP:  # TODO: Not sure if this is the best way to bypass the mmap issues
-                        tensor = tensor.to(device=device, copy=True)
-                    sd[k] = tensor
-                if return_metadata:
-                    metadata = f.metadata()
+            if enables_dynamic_vram():
+                sd, metadata = load_safetensors(ckpt)
+                if not return_metadata:
+                    metadata = None
+            else:
+                with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
+                    sd = {}
+                    for k in f.keys():
+                        tensor = f.get_tensor(k)
+                        if DISABLE_MMAP:  # TODO: Not sure if this is the best way to bypass the mmap issues
+                            tensor = tensor.to(device=device, copy=True)
+                        sd[k] = tensor
+                    if return_metadata:
+                        metadata = f.metadata()
        except Exception as e:
            if len(e.args) > 0:
                message = e.args[0]
@@ -611,6 +662,14 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
                        "ff_context.net.0.proj.bias": "txt_mlp.0.bias",
                        "ff_context.net.2.weight": "txt_mlp.2.weight",
                        "ff_context.net.2.bias": "txt_mlp.2.bias",
+                        "ff.linear_in.weight": "img_mlp.0.weight",  # LyCoris LoKr
+                        "ff.linear_in.bias": "img_mlp.0.bias",
+                        "ff.linear_out.weight": "img_mlp.2.weight",
+                        "ff.linear_out.bias": "img_mlp.2.bias",
+                        "ff_context.linear_in.weight": "txt_mlp.0.weight",
+                        "ff_context.linear_in.bias": "txt_mlp.0.bias",
+                        "ff_context.linear_out.weight": "txt_mlp.2.weight",
+                        "ff_context.linear_out.bias": "txt_mlp.2.bias",
                        "attn.norm_q.weight": "img_attn.norm.query_norm.scale",
                        "attn.norm_k.weight": "img_attn.norm.key_norm.scale",
                        "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale",
@@ -1300,3 +1359,16 @@ def convert_old_quants(state_dict, model_prefix="", metadata={}):
            state_dict["{}.comfy_quant".format(k)] = torch.tensor(list(json.dumps(v).encode('utf-8')), dtype=torch.uint8)

    return state_dict, metadata
+
+def string_to_seed(data):
+    """
+    Fold `data` into a 32-bit integer with a bitwise CRC-32.
+
+    Uses the reflected polynomial 0xEDB88320, an all-ones initial value and a
+    final XOR with 0xFFFFFFFF.
+
+    Args:
+        data: Iterable of ints (e.g. bytes) or of one-character strings; string
+            items are converted with `ord` before being mixed in.
+
+    Returns:
+        The checksum as a non-negative int.
+    """
+    poly = 0xEDB88320
+    acc = 0xFFFFFFFF
+    for item in data:
+        acc ^= ord(item) if isinstance(item, str) else item
+        for _ in range(8):
+            low_bit = acc & 1
+            acc >>= 1
+            if low_bit:
+                acc ^= poly
+    return acc ^ 0xFFFFFFFF
@@ -5,6 +5,11 @@ from .lokr import LoKrAdapter
 from .glora import GLoRAAdapter
 from .oft import OFTAdapter
 from .boft import BOFTAdapter
+from .bypass import (
+    BypassInjectionManager,
+    BypassForwardHook,
+    create_bypass_injections_from_patches,
+)


 adapters: list[type[WeightAdapterBase]] = [
@@ -31,4 +36,7 @@ __all__ = [
    "WeightAdapterTrainBase",
    "adapters",
    "adapter_maps",
+    "BypassInjectionManager",
+    "BypassForwardHook",
+    "create_bypass_injections_from_patches",
 ] + [a.__name__ for a in adapters]
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Callable, Optional

 import torch
 import torch.nn as nn
@@ -7,12 +7,35 @@ import comfy.model_management


 class WeightAdapterBase:
+    """
+    Base class for weight adapters (LoRA, LoHa, LoKr, OFT, etc.)
+
+    Bypass Mode:
+        All adapters follow the pattern: bypass(f)(x) = g(f(x) + h(x))
+
+        - h(x): Additive component (LoRA path). Returns delta to add to base output.
+        - g(y): Output transformation. Applied after base + h(x).
+
+        For LoRA/LoHa/LoKr: g = identity, h = adapter(x)
+        For OFT/BOFT: g = transform, h = 0
+    """
+
    name: str
    loaded_keys: set[str]
    weights: list[torch.Tensor]

+    # Attributes set by bypass system
+    multiplier: float = 1.0
+    shape: tuple = None  # (out_features, in_features) or (out_ch, in_ch, *kernel)
+
    @classmethod
-    def load(cls, x: str, lora: dict[str, torch.Tensor], alpha: float, dora_scale: torch.Tensor) -> Optional["WeightAdapterBase"]:
+    def load(
+        cls,
+        x: str,
+        lora: dict[str, torch.Tensor],
+        alpha: float,
+        dora_scale: torch.Tensor,
+    ) -> Optional["WeightAdapterBase"]:
        raise NotImplementedError

    def to_train(self) -> "WeightAdapterTrainBase":
@@ -39,18 +62,202 @@ class WeightAdapterBase:
    ):
        raise NotImplementedError

+    # ===== Bypass Mode Methods =====
+    #
+    # IMPORTANT: Bypass mode is designed for quantized models where original weights
+    # may not be accessible in a usable format. Therefore, h() and bypass_forward()
+    # do NOT take org_weight as a parameter. All necessary information (out_channels,
+    # in_channels, conv params, etc.) is provided via attributes set by BypassForwardHook.
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component: h(x, base_out)
+
+        Computes the adapter's contribution to be added to base forward output.
+        This base implementation ignores `x` and returns zeros shaped like
+        `base_out`, i.e. it adds nothing (the behaviour wanted for OFT/BOFT).
+
+        Note:
+            This method does NOT access original model weights. Bypass mode is
+            designed for quantized models where weights may not be in a usable format.
+            All shape info comes from module attributes set by BypassForwardHook.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward f(x), can be used for shape reference
+
+        Returns:
+            Delta tensor to add to base output. Shape matches base output.
+
+        Reference: LyCORIS LoConModule.bypass_forward_diff
+        """
+        # Default: no additive component (for OFT/BOFT)
+        # zeros_like keeps base_out's shape, dtype and device
+        return torch.zeros_like(base_out)
+
+    def g(self, y: torch.Tensor) -> torch.Tensor:
+        """
+        Output transformation: g(y)
+
+        Applied after base forward + h(x). For most adapters this is identity.
+        OFT/BOFT override this to apply orthogonal transformation.
+
+        Args:
+            y: Combined output (base + h(x))
+
+        Returns:
+            Transformed output. This default returns `y` itself, not a copy.
+
+        Reference: LyCORIS OFTModule applies orthogonal transform here
+        """
+        # Default: identity (for LoRA/LoHa/LoKr)
+        return y
+
+    def bypass_forward(
+        self,
+        org_forward: Callable,
+        x: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Run the wrapped forward with the adapter applied: g(f(x) + h(x, f(x)))
+
+        Note:
+            No org_weight/org_bias is passed in. `org_forward` is called as-is and
+            is responsible for its own weight access, which keeps this usable
+            for quantized models whose weights may not be directly readable.
+
+        Args:
+            org_forward: Original module forward function
+            x: Input tensor
+            *args, **kwargs: Forwarded unchanged to org_forward
+
+        Returns:
+            Output with adapter applied in bypass mode
+
+        Reference: LyCORIS LoConModule.bypass_forward
+        """
+        # f(x): output of the untouched base layer
+        y = org_forward(x, *args, **kwargs)
+
+        # h() receives f(x) as well so it can use it as a shape reference;
+        # g() then transforms the sum.
+        return self.g(y + self.h(x, y))
+

 class WeightAdapterTrainBase(nn.Module):
-    # We follow the scheme of PR #7032
+    """
+    Base class for trainable weight adapters (LoRA, LoHa, LoKr, OFT, etc.)
+
+    Bypass Mode:
+        All adapters follow the pattern: bypass(f)(x) = g(f(x) + h(x))
+
+        - h(x): Additive component (LoRA path). Returns delta to add to base output.
+        - g(y): Output transformation. Applied after base + h(x).
+
+        For LoRA/LoHa/LoKr: g = identity, h = adapter(x)
+        For OFT: g = transform, h = 0
+
+    Note:
+        Unlike WeightAdapterBase, TrainBase classes have simplified weight formats
+        with fewer branches (e.g., LoKr only has w1/w2, not w1_a/w1_b decomposition).
+
+    We follow the scheme of PR #7032
+    """
+
+    # Attributes set by bypass system (BypassForwardHook)
+    # These are set before h()/g()/bypass_forward() are called
+    multiplier: float = 1.0
+    is_conv: bool = False
+    conv_dim: int = 0  # 0=linear, 1=conv1d, 2=conv2d, 3=conv3d
+    kw_dict: dict = {}  # Conv kwargs: stride, padding, dilation, groups
+    kernel_size: tuple = ()
+    in_channels: int = None
+    out_channels: int = None
+
    def __init__(self):
        super().__init__()

    def __call__(self, w):
        """
-        w: The original weight tensor to be modified.
+        Weight modification mode: returns modified weight.
+
+        Args:
+            w: The original weight tensor to be modified.
+
+        Returns:
+            Modified weight tensor.
        """
        raise NotImplementedError

+    # ===== Bypass Mode Methods =====
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component: h(x, base_out)
+
+        Contract for subclasses: compute the adapter's contribution to be added
+        to the base forward output. Adapters that only transform the output
+        (OFT) are expected to return zeros.
+
+        This base implementation computes nothing; it always raises.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward f(x), can be used for shape reference
+
+        Returns:
+            Delta tensor to add to base output. Shape matches base output.
+
+        Raises:
+            NotImplementedError: Always, until a subclass overrides this method.
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__}.h() not implemented. "
+            "Subclasses must implement h() for bypass mode."
+        )
+
+    def g(self, y: torch.Tensor) -> torch.Tensor:
+        """
+        Output transformation: g(y)
+
+        Applied after base forward + h(x). For most adapters this is identity.
+        OFT overrides this to apply orthogonal transformation.
+
+        Args:
+            y: Combined output (base + h(x))
+
+        Returns:
+            Transformed output. This default returns `y` itself, not a copy.
+        """
+        # Default: identity (for LoRA/LoHa/LoKr)
+        return y
+
+    def bypass_forward(
+        self,
+        org_forward: Callable,
+        x: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Run the wrapped forward with the adapter applied: g(f(x) + h(x, f(x)))
+
+        Args:
+            org_forward: Original module forward function
+            x: Input tensor
+            *args, **kwargs: Forwarded unchanged to org_forward
+
+        Returns:
+            Output with adapter applied in bypass mode
+        """
+        # f(x): output of the untouched base layer
+        y = org_forward(x, *args, **kwargs)
+
+        # h() receives f(x) as well so it can use it as a shape reference;
+        # g() then transforms the sum.
+        return self.g(y + self.h(x, y))
+
    def passive_memory_usage(self):
        raise NotImplementedError("passive_memory_usage is not implemented")

@@ -59,8 +266,12 @@ class WeightAdapterTrainBase(nn.Module):
        return self.passive_memory_usage()


-def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function):
-    dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype)
+def weight_decompose(
+    dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function
+):
+    dora_scale = comfy.model_management.cast_to_device(
+        dora_scale, weight.device, intermediate_dtype
+    )
    lora_diff *= alpha
    weight_calc = weight + function(lora_diff).type(weight.dtype)

@@ -106,10 +317,14 @@ def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Ten
        the original tensor will be truncated in that dimension.
    """
    if any([new_shape[i] < tensor.shape[i] for i in range(len(new_shape))]):
-        raise ValueError("The new shape must be larger than the original tensor in all dimensions")
+        raise ValueError(
+            "The new shape must be larger than the original tensor in all dimensions"
+        )

    if len(new_shape) != len(tensor.shape):
-        raise ValueError("The new shape must have the same number of dimensions as the original tensor")
+        raise ValueError(
+            "The new shape must have the same number of dimensions as the original tensor"
+        )

    # Create a new tensor filled with zeros
    padded_tensor = torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device)
@@ -62,9 +62,13 @@ class BOFTAdapter(WeightAdapterBase):
        alpha = v[2]
        dora_scale = v[3]

-        blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype)
+        blocks = comfy.model_management.cast_to_device(
+            blocks, weight.device, intermediate_dtype
+        )
        if rescale is not None:
-            rescale = comfy.model_management.cast_to_device(rescale, weight.device, intermediate_dtype)
+            rescale = comfy.model_management.cast_to_device(
+                rescale, weight.device, intermediate_dtype
+            )

        boft_m, block_num, boft_b, *_ = blocks.shape

@@ -74,7 +78,7 @@ class BOFTAdapter(WeightAdapterBase):
            # for Q = -Q^T
            q = blocks - blocks.transpose(-1, -2)
            normed_q = q
-            if alpha > 0: # alpha in boft/bboft is for constraint
+            if alpha > 0:  # alpha in boft/bboft is for constraint
                q_norm = torch.norm(q) + 1e-8
                if q_norm > alpha:
                    normed_q = q * alpha / q_norm
@@ -83,13 +87,13 @@ class BOFTAdapter(WeightAdapterBase):
            r = r.to(weight)
            inp = org = weight

-            r_b = boft_b//2
+            r_b = boft_b // 2
            for i in range(boft_m):
                bi = r[i]
                g = 2
                k = 2**i * r_b
                if strength != 1:
-                    bi = bi * strength + (1-strength) * I
+                    bi = bi * strength + (1 - strength) * I
                inp = (
                    inp.unflatten(0, (-1, g, k))
                    .transpose(1, 2)
@@ -98,18 +102,117 @@ class BOFTAdapter(WeightAdapterBase):
                )
                inp = torch.einsum("b i j, b j ...-> b i ...", bi, inp)
                inp = (
-                    inp.flatten(0, 1).unflatten(0, (-1, k, g)).transpose(1, 2).flatten(0, 2)
+                    inp.flatten(0, 1)
+                    .unflatten(0, (-1, k, g))
+                    .transpose(1, 2)
+                    .flatten(0, 2)
                )

            if rescale is not None:
                inp = inp * rescale

            lora_diff = inp - org
-            lora_diff = comfy.model_management.cast_to_device(lora_diff, weight.device, intermediate_dtype)
+            lora_diff = comfy.model_management.cast_to_device(
+                lora_diff, weight.device, intermediate_dtype
+            )
            if dora_scale is not None:
-                weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                weight = weight_decompose(
+                    dora_scale,
+                    weight,
+                    lora_diff,
+                    alpha,
+                    strength,
+                    intermediate_dtype,
+                    function,
+                )
            else:
                weight += function((strength * lora_diff).type(weight.dtype))
        except Exception as e:
            logging.error("ERROR {} {} {}".format(self.name, key, e))
        return weight
+
+    def _get_orthogonal_matrices(self, device, dtype):
+        """
+        Compute the orthogonal rotation matrices R from BOFT blocks.
+
+        Builds the skew-symmetric Q = B - B^T from `self.weights[0]`, scales it
+        down when alpha (`self.weights[2]`) is positive and the norm of Q
+        exceeds it, then applies the Cayley transform R = (I + Q)(I - Q)^-1.
+
+        The whole computation runs in float32 and only the result is cast to
+        `dtype`. Mixing a `dtype` operand with the float32 inverse would make
+        the matmul fail for any `dtype` other than float32 (e.g. fp16/bf16
+        activations), and callers combine R with tensors of `dtype`.
+
+        Args:
+            device: Device to compute on.
+            dtype: dtype of the returned matrices.
+
+        Returns:
+            Tuple `(r, boft_m, boft_b)`: `r` has the shape of the blocks tensor;
+            `boft_m` and `boft_b` are the first and third sizes of that shape.
+        """
+        v = self.weights
+        alpha = v[2]
+        if alpha is None:
+            alpha = 0
+
+        blocks = v[0].to(device=device, dtype=torch.float32)
+        boft_m, _, boft_b, _ = blocks.shape
+        I = torch.eye(boft_b, device=device, dtype=torch.float32)
+
+        # Q = blocks - blocks^T (skew-symmetric)
+        q = blocks - blocks.transpose(-1, -2)
+        normed_q = q
+
+        # Apply constraint if alpha > 0
+        if alpha > 0:
+            q_norm = torch.norm(q) + 1e-8
+            if q_norm > alpha:
+                normed_q = q * alpha / q_norm
+
+        # Cayley transform: R = (I + Q)(I - Q)^-1, both factors in float32
+        r = (I + normed_q) @ (I - normed_q).inverse()
+        return r.to(dtype), boft_m, boft_b
+
+    def g(self, y: torch.Tensor) -> torch.Tensor:
+        """
+        Output transformation for BOFT: applies butterfly orthogonal transform.
+
+        BOFT uses multiple stages of butterfly-structured orthogonal transforms.
+        Each of the `boft_m` stages regroups the last axis into blocks of size
+        `boft_b`, multiplies every block by its matrix from
+        `_get_orthogonal_matrices`, and restores the original ordering.
+
+        Args:
+            y: Output of the base layer plus h(x). The transform acts on the last
+                axis; for conv outputs dim 1 is swapped into that position first
+                (presumably the channel dim -- TODO confirm).
+
+        Returns:
+            Transformed tensor with the same dim order as the input.
+
+        Reference: LyCORIS ButterflyOFTModule._bypass_forward
+        """
+        v = self.weights
+        rescale = v[1]
+
+        r, boft_m, boft_b = self._get_orthogonal_matrices(y.device, y.dtype)
+        r_b = boft_b // 2
+
+        # Strength of the adapter; falls back to 1.0 when the attribute is absent
+        multiplier = getattr(self, "multiplier", 1.0)
+        I = torch.eye(boft_b, device=y.device, dtype=y.dtype)
+
+        # Use module info from bypass injection to determine conv vs linear.
+        # Fallback when the attribute is absent: any output with more than 2 dims counts as conv.
+        # NOTE(review): a linear layer fed (batch, seq, features) would take that
+        # fallback -- confirm is_conv is always set before g() runs.
+        is_conv = getattr(self, "is_conv", y.dim() > 2)
+
+        if is_conv:
+            # Swap dim 1 with the last dim so the transform sees dim 1 last.
+            # This is a swap, not a move: (N, C, H, W) becomes (N, W, H, C).
+            y = y.transpose(1, -1)
+
+        # Apply butterfly transform stages
+        inp = y
+        for i in range(boft_m):
+            bi = r[i]  # (block_num, boft_b, boft_b)
+            g = 2
+            k = 2**i * r_b
+
+            # Interpolate with identity based on multiplier
+            if multiplier != 1:
+                bi = bi * multiplier + (1 - multiplier) * I
+
+            # Reshape for butterfly: unflatten last dim, transpose, flatten, unflatten
+            inp = (
+                inp.unflatten(-1, (-1, g, k))
+                .transpose(-2, -1)
+                .flatten(-3)
+                .unflatten(-1, (-1, boft_b))
+            )
+            # Apply block-diagonal orthogonal transform
+            inp = torch.einsum("b i j, ... b j -> ... b i", bi, inp)
+            # Reshape back
+            inp = (
+                inp.flatten(-2).unflatten(-1, (-1, k, g)).transpose(-2, -1).flatten(-3)
+            )
+
+        # Apply rescale if present
+        if rescale is not None:
+            rescale = rescale.to(device=y.device, dtype=y.dtype)
+            # Swapping first and last dims lets rescale broadcast along inp's last axis
+            # (presumably rescale is stored with the output dim first -- TODO confirm).
+            inp = inp * rescale.transpose(0, -1)
+
+        if is_conv:
+            # Undo the earlier swap of dim 1 and the last dim
+            inp = inp.transpose(1, -1)
+
+        return inp
@@ -0,0 +1,437 @@
+"""
+Bypass mode implementation for weight adapters (LoRA, LoKr, LoHa, etc.)
+
+Bypass mode applies adapters during forward pass without modifying base weights:
+    bypass(f)(x) = g(f(x) + h(x))
+
+Where:
+    - f(x): Original layer forward
+    - h(x): Additive component from adapter (LoRA path)
+    - g(y): Output transformation (identity for most adapters)
+
+This is useful for:
+    - Training with gradient checkpointing
+    - Avoiding weight modifications when weights are offloaded
+    - Supporting multiple adapters with different strengths dynamically
+"""
+
+import logging
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from .base import WeightAdapterBase, WeightAdapterTrainBase
+from comfy.patcher_extension import PatcherInjection
+
+# Type alias for adapters that support bypass mode
+BypassAdapter = Union[WeightAdapterBase, WeightAdapterTrainBase]
+
+
+def get_module_type_info(module: nn.Module) -> dict:
+    """
+    Classify a layer as conv or non-conv and collect its convolution settings.
+
+    Classification looks at the module class (isinstance checks first, then the
+    lowercased class name for custom/quantized layers) instead of weight.ndim.
+
+    Returns:
+        dict with keys: is_conv, conv_dim, stride, padding, dilation, groups,
+        kernel_size, in_channels, out_channels
+    """
+    conv_dim = 0
+    for dim, conv_cls in ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d)):
+        if isinstance(module, conv_cls):
+            conv_dim = dim
+            break
+    else:
+        # Not a stock conv layer. Stock Linear stays non-conv; anything else is
+        # guessed from its class name, with a bare "conv" match treated as 2D.
+        if not isinstance(module, nn.Linear):
+            lowered = type(module).__name__.lower()
+            for dim, tag in ((3, "conv3d"), (2, "conv2d"), (1, "conv1d"), (2, "conv")):
+                if tag in lowered:
+                    conv_dim = dim
+                    break
+
+    is_conv = conv_dim > 0
+    info = {
+        "is_conv": is_conv,
+        "conv_dim": conv_dim,
+        "stride": (1,),
+        "padding": (0,),
+        "dilation": (1,),
+        "groups": 1,
+        "kernel_size": (1,),
+        "in_channels": None,
+        "out_channels": None,
+    }
+    if not is_conv:
+        return info
+
+    # Read the conv hyper-parameters off the module, defaulting per conv_dim
+    ones = (1,) * conv_dim
+    zeros = (0,) * conv_dim
+    info["stride"] = getattr(module, "stride", ones)
+    info["padding"] = getattr(module, "padding", zeros)
+    info["dilation"] = getattr(module, "dilation", ones)
+    info["groups"] = getattr(module, "groups", 1)
+    info["kernel_size"] = getattr(module, "kernel_size", ones)
+    info["in_channels"] = getattr(module, "in_channels", None)
+    info["out_channels"] = getattr(module, "out_channels", None)
+
+    # Scalar settings are broadcast to one entry per spatial dimension
+    for name in ("stride", "padding", "dilation", "kernel_size"):
+        value = info[name]
+        if isinstance(value, int):
+            info[name] = (value,) * conv_dim
+
+    return info
+
+
+class BypassForwardHook:
+    """
+    Hook that wraps a layer's forward to apply adapter in bypass mode.
+
+    Stores the original forward and replaces it with bypass version.
+
+    Supports both:
+        - WeightAdapterBase: Inference adapters (uses self.weights tuple)
+        - WeightAdapterTrainBase: Training adapters (nn.Module with parameters)
+
+    Note:
+        Construction writes multiplier, is_conv, conv_dim, kernel_size,
+        in_channels, out_channels and kw_dict onto the adapter object itself.
+    """
+
+    # Dtypes adapter weights may be cast to at injection time. A wrapped layer
+    # whose weight uses any other dtype (integer or fp8 storage of a quantized
+    # layer) only determines the target device; casting adapter weights to such
+    # a storage dtype would truncate them.
+    _ADAPTER_CAST_DTYPES = (
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        torch.float64,
+    )
+
+    def __init__(
+        self,
+        module: nn.Module,
+        adapter: BypassAdapter,
+        multiplier: float = 1.0,
+    ):
+        """
+        Args:
+            module: Layer whose forward will be wrapped
+            adapter: Adapter applied around the layer's forward
+            multiplier: Strength stored on the adapter for use in its h()/g()
+        """
+        self.module = module
+        self.adapter = adapter
+        self.multiplier = multiplier
+        # None means "not injected"; set by inject(), reset by eject()
+        self.original_forward = None
+
+        # Determine layer type and conv params from module class (works for quantized layers)
+        module_info = get_module_type_info(module)
+
+        # Set multiplier and layer type info on adapter for use in h()
+        adapter.multiplier = multiplier
+        adapter.is_conv = module_info["is_conv"]
+        adapter.conv_dim = module_info["conv_dim"]
+        adapter.kernel_size = module_info["kernel_size"]
+        adapter.in_channels = module_info["in_channels"]
+        adapter.out_channels = module_info["out_channels"]
+        # Store kw_dict for conv operations (like LyCORIS extra_args)
+        if module_info["is_conv"]:
+            adapter.kw_dict = {
+                "stride": module_info["stride"],
+                "padding": module_info["padding"],
+                "dilation": module_info["dilation"],
+                "groups": module_info["groups"],
+            }
+        else:
+            adapter.kw_dict = {}
+
+    def _bypass_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """Bypass forward: uses adapter's bypass_forward or default g(f(x) + h(x))
+
+        Note:
+            Bypass mode does NOT access original model weights (org_weight).
+            This is intentional - bypass mode is designed for quantized models
+            where weights may not be in a usable format. All necessary shape
+            information is provided via adapter attributes set in __init__.
+        """
+        # Check if adapter has custom bypass_forward (e.g., GLoRA)
+        adapter_bypass = getattr(self.adapter, "bypass_forward", None)
+        if adapter_bypass is not None:
+            # Check if it's overridden (not the base class default)
+            # Need to check both base classes since adapter could be either type
+            adapter_type = type(self.adapter)
+            is_default_bypass = (
+                adapter_type.bypass_forward is WeightAdapterBase.bypass_forward
+                or adapter_type.bypass_forward is WeightAdapterTrainBase.bypass_forward
+            )
+            if not is_default_bypass:
+                return adapter_bypass(self.original_forward, x, *args, **kwargs)
+
+        # Default bypass: g(f(x) + h(x, f(x)))
+        base_out = self.original_forward(x, *args, **kwargs)
+        h_out = self.adapter.h(x, base_out)
+        return self.adapter.g(base_out + h_out)
+
+    def inject(self):
+        """Replace module forward with bypass version.
+
+        Does nothing when the hook is already injected. Before swapping the
+        forward, adapter weights are moved to the device of the module's
+        ``weight`` (or ``W_q``) tensor when one is present.
+        """
+        if self.original_forward is not None:
+            logging.debug(
+                f"[BypassHook] Already injected for {type(self.module).__name__}"
+            )
+            return  # Already injected
+
+        # Move adapter weights to module's device to avoid CPU-GPU transfer on every forward
+        device = None
+        dtype = None
+        if hasattr(self.module, "weight") and self.module.weight is not None:
+            device = self.module.weight.device
+            dtype = self.module.weight.dtype
+        elif hasattr(self.module, "W_q"):  # Quantized layers might use different attr
+            device = self.module.W_q.device
+            dtype = self.module.W_q.dtype
+
+        # Only follow the module dtype when it is a regular float dtype; for
+        # quantized storage dtypes keep the adapter's own dtype.
+        if dtype not in self._ADAPTER_CAST_DTYPES:
+            dtype = None
+
+        if device is not None:
+            self._move_adapter_weights_to_device(device, dtype)
+
+        self.original_forward = self.module.forward
+        self.module.forward = self._bypass_forward
+        logging.debug(
+            f"[BypassHook] Injected bypass forward for {type(self.module).__name__} (adapter={type(self.adapter).__name__})"
+        )
+
+    def _move_adapter_weights_to_device(self, device, dtype=None):
+        """Move adapter weights to specified device to avoid per-forward transfers.
+
+        Handles both:
+            - WeightAdapterBase: has self.weights tuple of tensors
+            - WeightAdapterTrainBase: nn.Module with parameters, uses .to() method
+
+        Args:
+            device: Target device
+            dtype: Optional target dtype; ignored for nn.Module adapters and
+                when None
+        """
+        adapter = self.adapter
+
+        # Check if adapter is an nn.Module (WeightAdapterTrainBase)
+        if isinstance(adapter, nn.Module):
+            # In training mode we don't touch dtype as trainer will handle it
+            adapter.to(device=device)
+            logging.debug(
+                f"[BypassHook] Moved training adapter (nn.Module) to {device}"
+            )
+            return
+
+        # WeightAdapterBase: handle self.weights tuple
+        if not hasattr(adapter, "weights") or adapter.weights is None:
+            return
+
+        def _move(tensor):
+            # Only pass dtype when one was requested
+            if dtype is not None:
+                return tensor.to(device=device, dtype=dtype)
+            return tensor.to(device=device)
+
+        weights = adapter.weights
+        if isinstance(weights, (list, tuple)):
+            # Non-tensor entries (e.g. alpha values, None) are kept untouched
+            new_weights = [
+                _move(w) if isinstance(w, torch.Tensor) else w for w in weights
+            ]
+            adapter.weights = (
+                tuple(new_weights) if isinstance(weights, tuple) else new_weights
+            )
+        elif isinstance(weights, torch.Tensor):
+            adapter.weights = _move(weights)
+
+        logging.debug(f"[BypassHook] Moved adapter weights to {device}")
+
+    def eject(self):
+        """Restore original module forward. Does nothing when not injected."""
+        if self.original_forward is None:
+            logging.debug(f"[BypassHook] Not injected for {type(self.module).__name__}")
+            return  # Not injected
+
+        self.module.forward = self.original_forward
+        self.original_forward = None
+        logging.debug(
+            f"[BypassHook] Ejected bypass forward for {type(self.module).__name__}"
+        )
+
+
+class BypassInjectionManager:
+    """
+    Manages bypass mode injection for a collection of adapters.
+
+    Creates PatcherInjection objects that can be used with ModelPatcher.
+
+    Supports both inference adapters (WeightAdapterBase) and training adapters
+    (WeightAdapterTrainBase).
+
+    Usage:
+        manager = BypassInjectionManager()
+        manager.add_adapter("model.layers.0.self_attn.q_proj", lora_adapter, strength=0.8)
+        manager.add_adapter("model.layers.0.self_attn.k_proj", lora_adapter, strength=0.8)
+
+        injections = manager.create_injections(model)
+        model_patcher.set_injections("bypass_lora", injections)
+    """
+
+    def __init__(self):
+        # module key (without ".weight") -> (adapter, strength)
+        self.adapters: dict[str, tuple[BypassAdapter, float]] = {}
+        # hooks built by the most recent create_injections() call
+        self.hooks: list[BypassForwardHook] = []
+
+    def add_adapter(
+        self,
+        key: str,
+        adapter: BypassAdapter,
+        strength: float = 1.0,
+    ):
+        """
+        Add an adapter for a specific weight key.
+
+        A later call with the same key replaces the earlier entry.
+
+        Args:
+            key: Weight key (e.g., "model.layers.0.self_attn.q_proj.weight")
+            adapter: The weight adapter (LoRAAdapter, LoKrAdapter, etc.)
+            strength: Multiplier for adapter effect
+        """
+        # Remove .weight suffix if present for module lookup
+        module_key = key
+        if module_key.endswith(".weight"):
+            module_key = module_key[: -len(".weight")]
+            logging.debug(
+                f"[BypassManager] Stripped .weight suffix: {key} -> {module_key}"
+            )
+
+        self.adapters[module_key] = (adapter, strength)
+        logging.debug(
+            f"[BypassManager] Added adapter: {module_key} (type={type(adapter).__name__}, strength={strength})"
+        )
+
+    def clear_adapters(self):
+        """Remove all adapters."""
+        self.adapters.clear()
+
+    def _get_module_by_key(self, model: nn.Module, key: str) -> Optional[nn.Module]:
+        """Get a submodule by dot-separated key.
+
+        Numeric parts are used as integer indices, every other part as an
+        attribute name.
+
+        Returns:
+            The resolved object, or None (after logging an error) when any
+            part of the path cannot be resolved.
+        """
+        parts = key.split(".")
+        module = model
+        try:
+            for i, part in enumerate(parts):
+                if part.isdigit():
+                    module = module[int(part)]
+                else:
+                    module = getattr(module, part)
+            logging.debug(
+                f"[BypassManager] Found module for key {key}: {type(module).__name__}"
+            )
+            return module
+        except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
+            # TypeError: indexing an object that is not subscriptable.
+            # ValueError: str.isdigit() accepted a character int() rejects.
+            logging.error(f"[BypassManager] Failed to find module for key {key}: {e}")
+            logging.error(
+                f"[BypassManager] Failed at part index {i}, part={part}, current module type={type(module).__name__}"
+            )
+            return None
+
+    def create_injections(self, model: nn.Module) -> list[PatcherInjection]:
+        """
+        Create PatcherInjection objects for all registered adapters.
+
+        Adapters whose module cannot be found, or whose module has no
+        ``weight`` attribute, are skipped with a warning.
+
+        Args:
+            model: The model to inject into (e.g., model_patcher.model)
+
+        Returns:
+            List of PatcherInjection objects to use with model_patcher.set_injections()
+        """
+        # Each call owns a fresh list: the inject/eject callbacks returned by an
+        # earlier call keep operating on the hooks they were created with
+        # instead of having them swapped out underneath them.
+        hooks: list[BypassForwardHook] = []
+        self.hooks = hooks
+
+        logging.debug(
+            f"[BypassManager] create_injections called with {len(self.adapters)} adapters"
+        )
+        logging.debug(f"[BypassManager] Model type: {type(model).__name__}")
+
+        for key, (adapter, strength) in self.adapters.items():
+            logging.debug(f"[BypassManager] Looking for module: {key}")
+            module = self._get_module_by_key(model, key)
+
+            if module is None:
+                logging.warning(f"[BypassManager] Module not found for key {key}")
+                continue
+
+            if not hasattr(module, "weight"):
+                logging.warning(
+                    f"[BypassManager] Module {key} has no weight attribute (type={type(module).__name__})"
+                )
+                continue
+
+            # weight may be present but None; do not let the log line crash
+            weight_shape = getattr(module.weight, "shape", None)
+            logging.debug(
+                f"[BypassManager] Creating hook for {key} (module type={type(module).__name__}, weight shape={weight_shape})"
+            )
+            hook = BypassForwardHook(module, adapter, multiplier=strength)
+            hooks.append(hook)
+
+        logging.debug(f"[BypassManager] Created {len(hooks)} hooks")
+
+        # Create single injection that manages all hooks
+        def inject_all(model_patcher):
+            logging.debug(
+                f"[BypassManager] inject_all called, injecting {len(hooks)} hooks"
+            )
+            for hook in hooks:
+                hook.inject()
+                logging.debug(
+                    f"[BypassManager] Injected hook for {type(hook.module).__name__}"
+                )
+
+        def eject_all(model_patcher):
+            logging.debug(
+                f"[BypassManager] eject_all called, ejecting {len(hooks)} hooks"
+            )
+            for hook in hooks:
+                hook.eject()
+
+        return [PatcherInjection(inject=inject_all, eject=eject_all)]
+
+    def get_hook_count(self) -> int:
+        """Return number of hooks that will be/are injected."""
+        return len(self.hooks)
+
+
+def create_bypass_injections_from_patches(
+    model: nn.Module,
+    patches: dict,
+    strength: float = 1.0,
+) -> list[PatcherInjection]:
+    """
+    Build bypass injections from a model_patcher-style patches dict.
+
+    Every patch entry is a 5-tuple
+    (strength_patch, patch_data, strength_model, offset, function). Only entries
+    whose patch_data is a WeightAdapterBase / WeightAdapterTrainBase are used;
+    all other entries are ignored.
+
+    Args:
+        model: The model to inject into
+        patches: Dict mapping weight keys to lists of patch tuples
+        strength: Global multiplier applied on top of each patch's own strength
+
+    Returns:
+        List of PatcherInjection objects
+    """
+    adapter_types = (WeightAdapterBase, WeightAdapterTrainBase)
+    manager = BypassInjectionManager()
+
+    for key, patch_list in patches.items():
+        # Falsy patch lists contribute nothing
+        for patch_strength, patch_data, _model_strength, _offset, _fn in (
+            patch_list or ()
+        ):
+            if isinstance(patch_data, adapter_types):
+                manager.add_adapter(
+                    key, patch_data, strength=strength * patch_strength
+                )
+
+    return manager.create_injections(model)
@@ -1,7 +1,8 @@
 import logging
-from typing import Optional
+from typing import Callable, Optional

 import torch
+import torch.nn.functional as F
 import comfy.model_management
 from .base import WeightAdapterBase, weight_decompose

@@ -29,7 +30,14 @@ class GLoRAAdapter(WeightAdapterBase):
        b1_name = "{}.b1.weight".format(x)
        b2_name = "{}.b2.weight".format(x)
        if a1_name in lora:
-            weights = (lora[a1_name], lora[a2_name], lora[b1_name], lora[b2_name], alpha, dora_scale)
+            weights = (
+                lora[a1_name],
+                lora[a2_name],
+                lora[b1_name],
+                lora[b2_name],
+                alpha,
+                dora_scale,
+            )
            loaded_keys.add(a1_name)
            loaded_keys.add(a2_name)
            loaded_keys.add(b1_name)
@@ -58,16 +66,28 @@ class GLoRAAdapter(WeightAdapterBase):
            old_glora = True

        if v[3].shape[0] == v[2].shape[1] == v[0].shape[1] == v[1].shape[0]:
-            if old_glora and v[1].shape[0] == weight.shape[0] and weight.shape[0] == weight.shape[1]:
+            if (
+                old_glora
+                and v[1].shape[0] == weight.shape[0]
+                and weight.shape[0] == weight.shape[1]
+            ):
                pass
            else:
                old_glora = False
                rank = v[1].shape[0]

-        a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype)
-        a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype)
-        b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype)
-        b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype)
+        a1 = comfy.model_management.cast_to_device(
+            v[0].flatten(start_dim=1), weight.device, intermediate_dtype
+        )
+        a2 = comfy.model_management.cast_to_device(
+            v[1].flatten(start_dim=1), weight.device, intermediate_dtype
+        )
+        b1 = comfy.model_management.cast_to_device(
+            v[2].flatten(start_dim=1), weight.device, intermediate_dtype
+        )
+        b2 = comfy.model_management.cast_to_device(
+            v[3].flatten(start_dim=1), weight.device, intermediate_dtype
+        )

        if v[4] is not None:
            alpha = v[4] / rank
@@ -76,18 +96,195 @@ class GLoRAAdapter(WeightAdapterBase):

        try:
            if old_glora:
-                lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2), a1)).reshape(weight.shape) #old lycoris glora
+                lora_diff = (
+                    torch.mm(b2, b1)
+                    + torch.mm(
+                        torch.mm(
+                            weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2
+                        ),
+                        a1,
+                    )
+                ).reshape(
+                    weight.shape
+                )  # old lycoris glora
            else:
                if weight.dim() > 2:
-                    lora_diff = torch.einsum("o i ..., i j -> o j ...", torch.einsum("o i ..., i j -> o j ...", weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
+                    lora_diff = torch.einsum(
+                        "o i ..., i j -> o j ...",
+                        torch.einsum(
+                            "o i ..., i j -> o j ...",
+                            weight.to(dtype=intermediate_dtype),
+                            a1,
+                        ),
+                        a2,
+                    ).reshape(weight.shape)
                else:
-                    lora_diff = torch.mm(torch.mm(weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
+                    lora_diff = torch.mm(
+                        torch.mm(weight.to(dtype=intermediate_dtype), a1), a2
+                    ).reshape(weight.shape)
                lora_diff += torch.mm(b1, b2).reshape(weight.shape)

            if dora_scale is not None:
-                weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                weight = weight_decompose(
+                    dora_scale,
+                    weight,
+                    lora_diff,
+                    alpha,
+                    strength,
+                    intermediate_dtype,
+                    function,
+                )
            else:
                weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
        except Exception as e:
            logging.error("ERROR {} {} {}".format(self.name, key, e))
        return weight
+
+    def _compute_paths(self, x: torch.Tensor):
+        """
+        Compute A path and B path outputs for GLoRA bypass.
+
+        GLoRA: f(x) = Wx + WAx + Bx
+        - A path: added to the input of the base forward
+        - B path: added to the output of the base forward
+
+        Operand order follows calculate_weight:
+        - new format: W' = W + W @ a1 @ a2 + b1 @ b2
+              -> A = a1(a2(x)), B = b1(b2(x))
+        - old format: W' = W + W @ a2 @ a1 + b2 @ b1
+              -> A = a2(a1(x)), B = b2(b1(x))
+
+        Note:
+            Does not access original model weights - bypass mode is designed
+            for quantized models where weights may not be accessible.
+
+        Args:
+            x: Input to the wrapped layer
+
+        Returns: (a_out, b_out)
+
+        Raises:
+            ValueError: alpha is set but the weight shapes match neither the
+                old nor the new GLoRA layout, so no rank can be derived.
+        """
+        v = self.weights
+        # v = (a1, a2, b1, b2, alpha, dora_scale)
+        a1 = v[0]
+        a2 = v[1]
+        b1 = v[2]
+        b2 = v[3]
+        alpha = v[4]
+
+        dtype = x.dtype
+
+        # Cast dtype (weights should already be on correct device from inject())
+        a1 = a1.to(dtype=dtype)
+        a2 = a2.to(dtype=dtype)
+        b1 = b1.to(dtype=dtype)
+        b2 = b2.to(dtype=dtype)
+
+        # Determine rank and scale
+        # Check for old vs new glora format
+        rank = None
+        old_glora = False
+        if b2.shape[1] == b1.shape[0] == a1.shape[0] == a2.shape[1]:
+            rank = a1.shape[0]
+            old_glora = True
+
+        if b2.shape[0] == b1.shape[1] == a1.shape[1] == a2.shape[0]:
+            # Both layouts fit. calculate_weight additionally requires a square
+            # weight to keep the old format; the weight shape is not available
+            # here, so only the input width is compared.
+            if not (old_glora and a2.shape[0] == x.shape[-1]):
+                old_glora = False
+                rank = a2.shape[0]
+
+        if alpha is not None:
+            if rank is None:
+                raise ValueError(
+                    "GLoRA bypass: weight shapes match neither the old nor the "
+                    "new GLoRA layout, cannot derive rank for alpha scaling"
+                )
+            scale = alpha / rank
+        else:
+            scale = 1.0
+
+        # Apply multiplier
+        multiplier = getattr(self, "multiplier", 1.0)
+        scale = scale * multiplier
+
+        # Use module info from bypass injection, not input tensor shape
+        is_conv = getattr(self, "is_conv", False)
+        conv_dim = getattr(self, "conv_dim", 0)
+        kw_dict = getattr(self, "kw_dict", {})
+
+        if is_conv:
+            # Conv case - conv_dim is 1/2/3 for conv1d/2d/3d
+            conv_fn = (F.conv1d, F.conv2d, F.conv3d)[conv_dim - 1]
+
+            # Get module's stride/padding for spatial dimension handling
+            module_stride = kw_dict.get("stride", (1,) * conv_dim)
+            module_padding = kw_dict.get("padding", (0,) * conv_dim)
+            kernel_size = getattr(self, "kernel_size", (1,) * conv_dim)
+            in_channels = getattr(self, "in_channels", None)
+
+            # Ensure weights are in conv shape
+            # a1, a2, b1 are always 1x1 kernels
+            if a1.ndim == 2:
+                a1 = a1.view(*a1.shape, *([1] * conv_dim))
+            if a2.ndim == 2:
+                a2 = a2.view(*a2.shape, *([1] * conv_dim))
+            if b1.ndim == 2:
+                b1 = b1.view(*b1.shape, *([1] * conv_dim))
+            # b2 has actual kernel_size (like LoRA down)
+            if b2.ndim == 2:
+                if in_channels is not None:
+                    b2 = b2.view(b2.shape[0], in_channels, *kernel_size)
+                else:
+                    b2 = b2.view(*b2.shape, *([1] * conv_dim))
+
+            # A path: a2(x) -> a1(...) - 1x1 convs, no stride/padding needed, a_out is added to x
+            a2_out = conv_fn(x, a2)
+            a_out = conv_fn(a2_out, a1) * scale
+
+            # B path: b2(x) with kernel/stride/padding -> b1(...) 1x1
+            b2_out = conv_fn(x, b2, stride=module_stride, padding=module_padding)
+            b_out = conv_fn(b2_out, b1) * scale
+        else:
+            # Linear case. F.linear(t, m) computes t @ m.T, i.e. applies m to t.
+            if old_glora:
+                # Old format: W @ a2 @ a1 -> a2(a1(x)); b2 @ b1 -> b2(b1(x))
+                a_out = F.linear(F.linear(x, a1), a2) * scale
+                b_out = F.linear(F.linear(x, b1), b2) * scale
+            else:
+                # New format: W @ a1 @ a2 -> a1(a2(x)); b1 @ b2 -> b1(b2(x))
+                a_out = F.linear(F.linear(x, a2), a1) * scale
+                b_out = F.linear(F.linear(x, b2), b1) * scale
+
+        return a_out, b_out
+
+    def bypass_forward(
+        self,
+        org_forward: Callable,
+        x: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        GLoRA bypass forward: f(x + a(x)) + b(x)
+
+        The A path output is added to the input handed to the base forward and
+        the B path output is added to what the base forward returns.
+
+        Note:
+            Does not access original model weights - bypass mode is designed
+            for quantized models where weights may not be accessible.
+
+        Args:
+            org_forward: The wrapped layer's original forward
+            x: Input tensor
+            *args, **kwargs: Passed through to org_forward unchanged
+
+        Reference: LyCORIS GLoRAModule._bypass_forward
+        """
+        input_delta, output_delta = self._compute_paths(x)
+        shifted_input = x + input_delta
+        return org_forward(shifted_input, *args, **kwargs) + output_delta
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Return only the B path output of GLoRA.
+
+        Note:
+            The A path, which changes the input of org_forward, is not
+            represented here; bypass_forward() covers both paths and is the
+            one to use for correct GLoRA behavior. This method exists for
+            API compatibility.
+
+            Does not access original model weights - bypass mode is designed
+            for quantized models where weights may not be accessible.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward (unused, for API consistency)
+        """
+        return self._compute_paths(x)[1]
@@ -1,11 +1,22 @@
 import logging
+from functools import cache
 from typing import Optional

 import torch
+import torch.nn.functional as F
 import comfy.model_management
 from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose


+@cache
+def _warn_loha_bypass_inefficient():
+    """Log the LoHa bypass inefficiency warning; @cache limits it to one emission."""
+    message = (
+        "LoHa bypass mode is inefficient: full weight diff is computed each forward pass. "
+        "Consider using LoRA or LoKr for training with bypass mode."
+    )
+    logging.warning(message)
+
+
 class HadaWeight(torch.autograd.Function):
    @staticmethod
    def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)):
@@ -105,9 +116,19 @@ class LohaDiff(WeightAdapterTrainBase):

        scale = self.alpha / self.rank
        if self.use_tucker:
-            diff_weight = HadaWeightTucker.apply(self.hada_t1, self.hada_w1_a, self.hada_w1_b, self.hada_t2, self.hada_w2_a, self.hada_w2_b, scale)
+            diff_weight = HadaWeightTucker.apply(
+                self.hada_t1,
+                self.hada_w1_a,
+                self.hada_w1_b,
+                self.hada_t2,
+                self.hada_w2_a,
+                self.hada_w2_b,
+                scale,
+            )
        else:
-            diff_weight = HadaWeight.apply(self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale)
+            diff_weight = HadaWeight.apply(
+                self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale
+            )

        # Add the scaled difference to the original weight
        weight = w.to(diff_weight) + diff_weight.reshape(w.shape)
@@ -138,9 +159,7 @@ class LoHaAdapter(WeightAdapterBase):
        mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32)
        torch.nn.init.normal_(mat3, 0.1)
        torch.nn.init.normal_(mat4, 0.01)
-        return LohaDiff(
-            (mat1, mat2, alpha, mat3, mat4, None, None, None)
-        )
+        return LohaDiff((mat1, mat2, alpha, mat3, mat4, None, None, None))

    def to_train(self):
        return LohaDiff(self.weights)
@@ -172,7 +191,16 @@ class LoHaAdapter(WeightAdapterBase):
                loaded_keys.add(hada_t1_name)
                loaded_keys.add(hada_t2_name)

-            weights = (lora[hada_w1_a_name], lora[hada_w1_b_name], alpha, lora[hada_w2_a_name], lora[hada_w2_b_name], hada_t1, hada_t2, dora_scale)
+            weights = (
+                lora[hada_w1_a_name],
+                lora[hada_w1_b_name],
+                alpha,
+                lora[hada_w2_a_name],
+                lora[hada_w2_b_name],
+                hada_t1,
+                hada_t2,
+                dora_scale,
+            )
            loaded_keys.add(hada_w1_a_name)
            loaded_keys.add(hada_w1_b_name)
            loaded_keys.add(hada_w2_a_name)
@@ -203,30 +231,148 @@ class LoHaAdapter(WeightAdapterBase):
        w2a = v[3]
        w2b = v[4]
        dora_scale = v[7]
-        if v[5] is not None: #cp decomposition
+        if v[5] is not None:  # cp decomposition
            t1 = v[5]
            t2 = v[6]
-            m1 = torch.einsum('i j k l, j r, i p -> p r k l',
-                                comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype),
-                                comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype),
-                                comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype))
+            m1 = torch.einsum(
+                "i j k l, j r, i p -> p r k l",
+                comfy.model_management.cast_to_device(
+                    t1, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w1b, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w1a, weight.device, intermediate_dtype
+                ),
+            )

-            m2 = torch.einsum('i j k l, j r, i p -> p r k l',
-                                comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
-                                comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype),
-                                comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype))
+            m2 = torch.einsum(
+                "i j k l, j r, i p -> p r k l",
+                comfy.model_management.cast_to_device(
+                    t2, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w2b, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w2a, weight.device, intermediate_dtype
+                ),
+            )
        else:
-            m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype),
-                            comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype))
-            m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype),
-                            comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype))
+            m1 = torch.mm(
+                comfy.model_management.cast_to_device(
+                    w1a, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w1b, weight.device, intermediate_dtype
+                ),
+            )
+            m2 = torch.mm(
+                comfy.model_management.cast_to_device(
+                    w2a, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w2b, weight.device, intermediate_dtype
+                ),
+            )

        try:
            lora_diff = (m1 * m2).reshape(weight.shape)
            if dora_scale is not None:
-                weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                weight = weight_decompose(
+                    dora_scale,
+                    weight,
+                    lora_diff,
+                    alpha,
+                    strength,
+                    intermediate_dtype,
+                    function,
+                )
            else:
                weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
        except Exception as e:
            logging.error("ERROR {} {} {}".format(self.name, key, e))
        return weight
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component for LoHa: h(x) = diff_weight @ x
+
+        WARNING: Inefficient - computes full Hadamard product each forward.
+
+        Note:
+            Does not access original model weights - bypass mode is designed
+            for quantized models where weights may not be accessible.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward (unused, for API consistency)
+
+        Returns:
+            op(x, diff_weight, **kw_dict), where op is F.linear or the conv
+            function selected by conv_dim and diff_weight = (m1 * m2) * scale.
+
+        Reference: LyCORIS functional/loha.py bypass_forward_diff
+        """
+        _warn_loha_bypass_inefficient()
+
+        # Lookup table for the conv case: FUNC_LIST[conv_dim + 2] selects
+        # conv1d / conv2d / conv3d for conv_dim 1 / 2 / 3.
+        FUNC_LIST = [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d]
+
+        v = self.weights
+        # v[0]=w1a, v[1]=w1b, v[2]=alpha, v[3]=w2a, v[4]=w2b, v[5]=t1, v[6]=t2, v[7]=dora
+        w1a = v[0]
+        w1b = v[1]
+        alpha = v[2]
+        w2a = v[3]
+        w2b = v[4]
+        t1 = v[5]
+        t2 = v[6]
+
+        # Compute scale: alpha / rank (1.0 when alpha is None), multiplied by
+        # the optional `multiplier` attribute (1.0 when it is not set).
+        rank = w1b.shape[0]
+        scale = (alpha / rank if alpha is not None else 1.0) * getattr(
+            self, "multiplier", 1.0
+        )
+
+        # Cast the factor matrices to the input dtype
+        w1a = w1a.to(dtype=x.dtype)
+        w1b = w1b.to(dtype=x.dtype)
+        w2a = w2a.to(dtype=x.dtype)
+        w2b = w2b.to(dtype=x.dtype)
+
+        # Use module info from bypass injection, not weight dimension
+        # (each attribute falls back to a default when it was never set)
+        is_conv = getattr(self, "is_conv", False)
+        conv_dim = getattr(self, "conv_dim", 0)
+        kw_dict = getattr(self, "kw_dict", {})
+
+        # Compute diff weight using Hadamard product
+        if t1 is not None and t2 is not None:
+            # Tucker path: rebuild each factor from its core tensor first
+            t1 = t1.to(dtype=x.dtype)
+            t2 = t2.to(dtype=x.dtype)
+            m1 = torch.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a)
+            m2 = torch.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a)
+            diff_weight = (m1 * m2) * scale
+        else:
+            m1 = w1a @ w1b
+            m2 = w2a @ w2b
+            diff_weight = (m1 * m2) * scale
+
+        if is_conv:
+            op = FUNC_LIST[conv_dim + 2]
+            kernel_size = getattr(self, "kernel_size", (1,) * conv_dim)
+            in_channels = getattr(self, "in_channels", None)
+
+            # Reshape 2D diff_weight to conv format using kernel_size
+            # diff_weight: [out_channels, in_channels * prod(kernel_size)] -> [out_channels, in_channels, *kernel_size]
+            if diff_weight.dim() == 2:
+                if in_channels is not None:
+                    diff_weight = diff_weight.view(
+                        diff_weight.shape[0], in_channels, *kernel_size
+                    )
+                else:
+                    # Fallback when in_channels is unknown: append one
+                    # singleton dim per conv dimension (1x1-style kernel)
+                    diff_weight = diff_weight.view(
+                        *diff_weight.shape, *([1] * conv_dim)
+                    )
+        else:
+            # Linear op takes no extra kwargs, so any injected kw_dict is dropped
+            op = F.linear
+            kw_dict = {}
+
+        return op(x, diff_weight, **kw_dict)
@@ -2,6 +2,7 @@ import logging
 from typing import Optional

 import torch
+import torch.nn.functional as F
 import comfy.model_management
 from .base import (
    WeightAdapterBase,
@@ -14,7 +15,17 @@ from .base import (
 class LokrDiff(WeightAdapterTrainBase):
    def __init__(self, weights):
        super().__init__()
-        (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights
+        (
+            lokr_w1,
+            lokr_w2,
+            alpha,
+            lokr_w1_a,
+            lokr_w1_b,
+            lokr_w2_a,
+            lokr_w2_b,
+            lokr_t2,
+            dora_scale,
+        ) = weights
        self.use_tucker = False
        if lokr_w1_a is not None:
            _, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1]
@@ -57,10 +68,10 @@ class LokrDiff(WeightAdapterTrainBase):
        if self.w2_rebuild:
            if self.use_tucker:
                w2 = torch.einsum(
-                    'i j k l, j r, i p -> p r k l',
+                    "i j k l, j r, i p -> p r k l",
                    self.lokr_t2,
                    self.lokr_w2_b,
-                    self.lokr_w2_a
+                    self.lokr_w2_a,
                )
            else:
                w2 = self.lokr_w2_a @ self.lokr_w2_b
@@ -69,9 +80,89 @@ class LokrDiff(WeightAdapterTrainBase):
            return self.lokr_w2

    def __call__(self, w):
-        diff = torch.kron(self.w1, self.w2)
+        w1 = self.w1
+        w2 = self.w2
+        # Unsqueeze w1 to match w2 dims for proper kron product (like LyCORIS make_kron)
+        for _ in range(w2.dim() - w1.dim()):
+            w1 = w1.unsqueeze(-1)
+        diff = torch.kron(w1, w2)
        return w + diff.reshape(w.shape).to(w)

+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component for LoKr training: efficient Kronecker product.
+
+        Uses w1/w2 properties which handle both direct and decomposed cases.
+        For create_train (direct w1/w2), no alpha scaling in properties.
+        For to_train (decomposed), alpha/rank scaling is in properties.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward (unused, for API consistency)
+
+        Returns:
+            The result of applying w2 then w1 to the grouped input, multiplied
+            by the optional `multiplier` attribute (1.0 when it is not set).
+        """
+        # Get w1, w2 from properties (handles rebuild vs direct)
+        w1 = self.w1
+        w2 = self.w2
+
+        # Multiplier from bypass injection
+        multiplier = getattr(self, "multiplier", 1.0)
+
+        # Get module info from bypass injection
+        is_conv = getattr(self, "is_conv", False)
+        conv_dim = getattr(self, "conv_dim", 0)
+        kw_dict = getattr(self, "kw_dict", {})
+
+        # Efficient Kronecker application without materializing full weight
+        # kron(w1, w2) @ x can be computed as nested operations
+        # w1: [out_l, in_m], w2: [out_k, in_n, *k_size]
+        # Full weight would be [out_l*out_k, in_m*in_n, *k_size]
+
+        uq = w1.size(1)  # in_m - inner grouping dimension
+
+        if is_conv:
+            # conv_dim 1 / 2 / 3 selects conv1d / conv2d / conv3d
+            conv_fn = (F.conv1d, F.conv2d, F.conv3d)[conv_dim - 1]
+
+            # C_in is unpacked only to split off the spatial dims; it is not
+            # used afterwards (the reshape below infers it with -1)
+            B, C_in, *spatial = x.shape
+            # Reshape input for grouped application: [B * uq, C_in // uq, *spatial]
+            h_in_group = x.reshape(B * uq, -1, *spatial)
+
+            # Ensure w2 has conv dims
+            if w2.dim() == 2:
+                w2 = w2.view(*w2.shape, *([1] * conv_dim))
+
+            # Apply w2 path with stride/padding
+            hb = conv_fn(h_in_group, w2, **kw_dict)
+
+            # Reshape for cross-group operation: split the leading B * uq dim
+            # back into (B, uq), then swap the uq dim with the last dim
+            hb = hb.view(B, -1, *hb.shape[1:])
+            h_cross = hb.transpose(1, -1)
+
+            # Apply w1 (always 2D, applied as linear on channel dim)
+            hc = F.linear(h_cross, w1)
+            hc = hc.transpose(1, -1)
+
+            # Reshape to output: merge dims 1 and 2 into one channel dim
+            out = hc.reshape(B, -1, *hc.shape[3:])
+        else:
+            # Linear case
+            # Reshape input: [..., in_m * in_n] -> [..., uq (in_m), in_n]
+            h_in_group = x.reshape(*x.shape[:-1], uq, -1)
+
+            # Apply w2: [..., uq, in_n] @ [out_k, in_n].T -> [..., uq, out_k]
+            hb = F.linear(h_in_group, w2)
+
+            # Transpose for w1: [..., uq, out_k] -> [..., out_k, uq]
+            h_cross = hb.transpose(-1, -2)
+
+            # Apply w1: [..., out_k, uq] @ [out_l, uq].T -> [..., out_k, out_l]
+            hc = F.linear(h_cross, w1)
+
+            # Transpose back and flatten: [..., out_k, out_l] -> [..., out_l * out_k]
+            hc = hc.transpose(-1, -2)
+            out = hc.reshape(*hc.shape[:-2], -1)
+
+        return out * multiplier
+
    def passive_memory_usage(self):
        return sum(param.numel() * param.element_size() for param in self.parameters())

@@ -86,16 +177,22 @@ class LoKrAdapter(WeightAdapterBase):
    @classmethod
    def create_train(cls, weight, rank=1, alpha=1.0):
        out_dim = weight.shape[0]
-        in_dim = weight.shape[1:].numel()
-        out1, out2 = factorization(out_dim, rank)
-        in1, in2 = factorization(in_dim, rank)
-        mat1 = torch.empty(out1, in1, device=weight.device, dtype=torch.float32)
-        mat2 = torch.empty(out2, in2, device=weight.device, dtype=torch.float32)
+        in_dim = weight.shape[1]  # Just in_channels, not flattened with kernel
+        k_size = weight.shape[2:] if weight.dim() > 2 else ()
+
+        out_l, out_k = factorization(out_dim, rank)
+        in_m, in_n = factorization(in_dim, rank)
+
+        # w1: [out_l, in_m]
+        mat1 = torch.empty(out_l, in_m, device=weight.device, dtype=torch.float32)
+        # w2: [out_k, in_n, *k_size] for conv, [out_k, in_n] for linear
+        mat2 = torch.empty(
+            out_k, in_n, *k_size, device=weight.device, dtype=torch.float32
+        )
+
        torch.nn.init.kaiming_uniform_(mat2, a=5**0.5)
        torch.nn.init.constant_(mat1, 0.0)
-        return LokrDiff(
-            (mat1, mat2, alpha, None, None, None, None, None, None)
-        )
+        return LokrDiff((mat1, mat2, alpha, None, None, None, None, None, None))

    def to_train(self):
        return LokrDiff(self.weights)
@@ -154,8 +251,23 @@ class LoKrAdapter(WeightAdapterBase):
            lokr_t2 = lora[lokr_t2_name]
            loaded_keys.add(lokr_t2_name)

-        if (lokr_w1 is not None) or (lokr_w2 is not None) or (lokr_w1_a is not None) or (lokr_w2_a is not None):
-            weights = (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale)
+        if (
+            (lokr_w1 is not None)
+            or (lokr_w2 is not None)
+            or (lokr_w1_a is not None)
+            or (lokr_w2_a is not None)
+        ):
+            weights = (
+                lokr_w1,
+                lokr_w2,
+                alpha,
+                lokr_w1_a,
+                lokr_w1_b,
+                lokr_w2_a,
+                lokr_w2_b,
+                lokr_t2,
+                dora_scale,
+            )
            return cls(loaded_keys, weights)
        else:
            return None
@@ -184,23 +296,47 @@ class LoKrAdapter(WeightAdapterBase):

        if w1 is None:
            dim = w1_b.shape[0]
-            w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype),
-                            comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype))
+            w1 = torch.mm(
+                comfy.model_management.cast_to_device(
+                    w1_a, weight.device, intermediate_dtype
+                ),
+                comfy.model_management.cast_to_device(
+                    w1_b, weight.device, intermediate_dtype
+                ),
+            )
        else:
-            w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype)
+            w1 = comfy.model_management.cast_to_device(
+                w1, weight.device, intermediate_dtype
+            )

        if w2 is None:
            dim = w2_b.shape[0]
            if t2 is None:
-                w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype),
-                                comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype))
+                w2 = torch.mm(
+                    comfy.model_management.cast_to_device(
+                        w2_a, weight.device, intermediate_dtype
+                    ),
+                    comfy.model_management.cast_to_device(
+                        w2_b, weight.device, intermediate_dtype
+                    ),
+                )
            else:
-                w2 = torch.einsum('i j k l, j r, i p -> p r k l',
-                                    comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
-                                    comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype),
-                                    comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype))
+                w2 = torch.einsum(
+                    "i j k l, j r, i p -> p r k l",
+                    comfy.model_management.cast_to_device(
+                        t2, weight.device, intermediate_dtype
+                    ),
+                    comfy.model_management.cast_to_device(
+                        w2_b, weight.device, intermediate_dtype
+                    ),
+                    comfy.model_management.cast_to_device(
+                        w2_a, weight.device, intermediate_dtype
+                    ),
+                )
        else:
-            w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype)
+            w2 = comfy.model_management.cast_to_device(
+                w2, weight.device, intermediate_dtype
+            )

        if len(w2.shape) == 4:
            w1 = w1.unsqueeze(2).unsqueeze(2)
@@ -212,9 +348,134 @@ class LoKrAdapter(WeightAdapterBase):
        try:
            lora_diff = torch.kron(w1, w2).reshape(weight.shape)
            if dora_scale is not None:
-                weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                weight = weight_decompose(
+                    dora_scale,
+                    weight,
+                    lora_diff,
+                    alpha,
+                    strength,
+                    intermediate_dtype,
+                    function,
+                )
            else:
                weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
        except Exception as e:
            logging.error("ERROR {} {} {}".format(self.name, key, e))
        return weight
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component for LoKr: efficient Kronecker product application.
+
+        Note:
+            Does not access original model weights - bypass mode is designed
+            for quantized models where weights may not be accessible.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward (unused, for API consistency)
+
+        Returns:
+            The grouped w2-then-w1 application of the adapter to x, multiplied
+            by scale (alpha / rank, or 1.0 when alpha is None, times the
+            optional `multiplier` attribute).
+
+        Reference: LyCORIS functional/lokr.py bypass_forward_diff
+        """
+        # Lookup table for the conv case: FUNC_LIST[conv_dim + 2] selects
+        # conv1d / conv2d / conv3d for conv_dim 1 / 2 / 3.
+        FUNC_LIST = [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d]
+
+        v = self.weights
+        # v[0]=w1, v[1]=w2, v[2]=alpha, v[3]=w1_a, v[4]=w1_b, v[5]=w2_a, v[6]=w2_b, v[7]=t2, v[8]=dora
+        w1 = v[0]
+        w2 = v[1]
+        alpha = v[2]
+        w1_a = v[3]
+        w1_b = v[4]
+        w2_a = v[5]
+        w2_b = v[6]
+        t2 = v[7]
+
+        use_w1 = w1 is not None
+        use_w2 = w2 is not None
+        tucker = t2 is not None
+
+        # Use module info from bypass injection, not weight dimension
+        # (kw_dict is only honoured for conv; linear takes no extra kwargs)
+        is_conv = getattr(self, "is_conv", False)
+        conv_dim = getattr(self, "conv_dim", 0)
+        kw_dict = getattr(self, "kw_dict", {}) if is_conv else {}
+
+        if is_conv:
+            op = FUNC_LIST[conv_dim + 2]
+        else:
+            op = F.linear
+
+        # Determine rank and scale.
+        # rank is w1_b's first dim when w1 is factored, else w2_b's first dim
+        # when w2 is factored, else alpha itself (so alpha / rank == 1 when
+        # both w1 and w2 are stored as full matrices).
+        rank = w1_b.size(0) if not use_w1 else w2_b.size(0) if not use_w2 else alpha
+        scale = (alpha / rank if alpha is not None else 1.0) * getattr(
+            self, "multiplier", 1.0
+        )
+
+        # Build c (w1)
+        if use_w1:
+            c = w1.to(dtype=x.dtype)
+        else:
+            c = w1_a.to(dtype=x.dtype) @ w1_b.to(dtype=x.dtype)
+        uq = c.size(1)  # number of input groups the w1 factor mixes
+
+        # Build w2 components
+        if use_w2:
+            ba = w2.to(dtype=x.dtype)
+        else:
+            # a (= w2_b) is applied to the input first, then b (= w2_a)
+            a = w2_b.to(dtype=x.dtype)
+            b = w2_a.to(dtype=x.dtype)
+            if is_conv:
+                if tucker:
+                    # Tucker: a, b get 1s appended (kernel is in t2)
+                    if a.dim() == 2:
+                        a = a.view(*a.shape, *([1] * conv_dim))
+                    if b.dim() == 2:
+                        b = b.view(*b.shape, *([1] * conv_dim))
+                else:
+                    # Non-tucker conv: b may need 1s appended
+                    if b.dim() == 2:
+                        b = b.view(*b.shape, *([1] * conv_dim))
+
+        # Reshape input by uq groups
+        if is_conv:
+            B, _, *rest = x.shape
+            h_in_group = x.reshape(B * uq, -1, *rest)
+        else:
+            h_in_group = x.reshape(*x.shape[:-1], uq, -1)
+
+        # Apply w2 path
+        if use_w2:
+            hb = op(h_in_group, ba, **kw_dict)
+        else:
+            if is_conv:
+                if tucker:
+                    # Conv kwargs go to the t2 conv; a and b are applied
+                    # without them
+                    t = t2.to(dtype=x.dtype)
+                    if t.dim() == 2:
+                        t = t.view(*t.shape, *([1] * conv_dim))
+                    ha = op(h_in_group, a)
+                    ht = op(ha, t, **kw_dict)
+                    hb = op(ht, b)
+                else:
+                    # Conv kwargs go to the first (a) conv; b is applied plain
+                    ha = op(h_in_group, a, **kw_dict)
+                    hb = op(ha, b)
+            else:
+                ha = op(h_in_group, a)
+                hb = op(ha, b)
+
+        # Reshape and apply c (w1)
+        if is_conv:
+            # Split the leading B * uq dim back into (B, uq), then swap the
+            # uq dim with the last dim so F.linear contracts over uq
+            hb = hb.view(B, -1, *hb.shape[1:])
+            h_cross_group = hb.transpose(1, -1)
+        else:
+            h_cross_group = hb.transpose(-1, -2)
+
+        hc = F.linear(h_cross_group, c)
+
+        if is_conv:
+            # Undo the swap, then merge dims 1 and 2 into one channel dim
+            hc = hc.transpose(1, -1)
+            out = hc.reshape(B, -1, *hc.shape[3:])
+        else:
+            # Undo the swap, then flatten the last two dims into one
+            hc = hc.transpose(-1, -2)
+            out = hc.reshape(*hc.shape[:-2], -1)
+
+        return out * scale
@@ -2,6 +2,7 @@ import logging
 from typing import Optional

 import torch
+import torch.nn.functional as F
 import comfy.model_management
 from .base import (
    WeightAdapterBase,
@@ -20,11 +21,7 @@ class LoraDiff(WeightAdapterTrainBase):
        rank, in_dim = mat2.shape[0], mat2.shape[1]
        if mid is not None:
            convdim = mid.ndim - 2
-            layer = (
-                torch.nn.Conv1d,
-                torch.nn.Conv2d,
-                torch.nn.Conv3d
-            )[convdim]
+            layer = (torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d)[convdim]
        else:
            layer = torch.nn.Linear
        self.lora_up = layer(rank, out_dim, bias=False)
@@ -51,6 +48,78 @@ class LoraDiff(WeightAdapterTrainBase):
        weight = w + scale * diff.reshape(w.shape)
        return weight.to(org_dtype)

+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component for LoRA training: h(x) = up(down(x)) * scale
+
+        Uses the nn.Module weights directly. An optional mid layer (tucker
+        decomposition) is applied between down and up when self.lora_mid is
+        not None.
+
+        On the conv path the module's conv kwargs (kw_dict, e.g.
+        stride/padding) are applied exactly once: to the mid conv when a mid
+        layer exists, otherwise to the down conv. This matches
+        LoRAAdapter.h; applying them to the down conv while a mid layer is
+        present gives different results for strided convolutions.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward (unused, for API consistency)
+
+        Returns:
+            up(mid(down(x))) * scale, with scale = alpha / rank * multiplier
+            (multiplier defaults to 1.0 when the attribute is not set).
+        """
+        # Compute scale = alpha / rank * multiplier
+        scale = (self.alpha / self.rank) * getattr(self, "multiplier", 1.0)
+
+        # Get module info from bypass injection
+        is_conv = getattr(self, "is_conv", False)
+        conv_dim = getattr(self, "conv_dim", 0)
+        kw_dict = getattr(self, "kw_dict", {})
+
+        # Get weights (keep in original dtype for numerical stability)
+        down_weight = self.lora_down.weight
+        up_weight = self.lora_up.weight
+        mid_weight = self.lora_mid.weight if self.lora_mid is not None else None
+
+        if is_conv:
+            # Conv path: use functional conv
+            # conv_dim: 1=conv1d, 2=conv2d, 3=conv3d
+            conv_fn = (F.conv1d, F.conv2d, F.conv3d)[conv_dim - 1]
+
+            # Reshape 2D weights to conv format if needed
+            # down: [rank, in_features] -> [rank, in_channels, *kernel_size]
+            # up: [out_features, rank] -> [out_features, rank, 1, 1, ...]
+            if down_weight.dim() == 2:
+                kernel_size = getattr(self, "kernel_size", (1,) * conv_dim)
+                in_channels = getattr(self, "in_channels", None)
+                if in_channels is not None:
+                    down_weight = down_weight.view(
+                        down_weight.shape[0], in_channels, *kernel_size
+                    )
+                else:
+                    # Fallback: assume 1x1 kernel
+                    down_weight = down_weight.view(
+                        *down_weight.shape, *([1] * conv_dim)
+                    )
+            if up_weight.dim() == 2:
+                # up always uses 1x1 kernel
+                up_weight = up_weight.view(*up_weight.shape, *([1] * conv_dim))
+
+            if mid_weight is None:
+                # No mid layer: the down conv carries stride/padding
+                hidden = conv_fn(x, down_weight, **kw_dict)
+            else:
+                # Tucker decomposition: down is applied plain and the mid
+                # conv carries stride/padding (same order as LoRAAdapter.h)
+                if mid_weight.dim() == 2:
+                    mid_weight = mid_weight.view(*mid_weight.shape, *([1] * conv_dim))
+                hidden = conv_fn(x, down_weight)
+                hidden = conv_fn(hidden, mid_weight, **kw_dict)
+
+            # up conv is always 1x1 (no stride/padding)
+            out = conv_fn(hidden, up_weight)
+        else:
+            # Linear path: simple matmul chain
+            hidden = F.linear(x, down_weight)
+
+            # mid layer if exists
+            if mid_weight is not None:
+                hidden = F.linear(hidden, mid_weight)
+
+            out = F.linear(hidden, up_weight)
+
+        return out * scale
+
+
    def passive_memory_usage(self):
        return sum(param.numel() * param.element_size() for param in self.parameters())

@@ -70,9 +139,7 @@ class LoRAAdapter(WeightAdapterBase):
        mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32)
        torch.nn.init.kaiming_uniform_(mat1, a=5**0.5)
        torch.nn.init.constant_(mat2, 0.0)
-        return LoraDiff(
-            (mat1, mat2, alpha, None, None, None)
-        )
+        return LoraDiff((mat1, mat2, alpha, None, None, None))

    def to_train(self):
        return LoraDiff(self.weights)
@@ -210,3 +277,85 @@ class LoRAAdapter(WeightAdapterBase):
        except Exception as e:
            logging.error("ERROR {} {} {}".format(self.name, key, e))
        return weight
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        Additive bypass component for LoRA: h(x) = up(down(x)) * scale
+
+        Note:
+            Does not access original model weights - bypass mode is designed
+            for quantized models where weights may not be accessible.
+
+        Args:
+            x: Input tensor
+            base_out: Output from base forward (unused, for API consistency)
+
+        Returns:
+            up(mid(down(x))) * scale when a mid weight is present, otherwise
+            up(down(x)) * scale.
+
+        Reference: LyCORIS functional/locon.py bypass_forward_diff
+        """
+        # Lookup table for the conv case: FUNC_LIST[conv_dim + 2] selects
+        # conv1d / conv2d / conv3d for conv_dim 1 / 2 / 3.
+        FUNC_LIST = [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d]
+
+        v = self.weights
+        # v[0]=up, v[1]=down, v[2]=alpha, v[3]=mid, v[4]=dora_scale, v[5]=reshape
+        up = v[0]
+        down = v[1]
+        alpha = v[2]
+        mid = v[3]
+
+        # Compute scale = alpha / rank (1.0 when alpha is None), then apply
+        # the optional `multiplier` attribute
+        rank = down.shape[0]
+        if alpha is not None:
+            scale = alpha / rank
+        else:
+            scale = 1.0
+        scale = scale * getattr(self, "multiplier", 1.0)
+
+        # Cast up/down to the input dtype
+        up = up.to(dtype=x.dtype)
+        down = down.to(dtype=x.dtype)
+
+        # Use module info from bypass injection, not weight dimension
+        is_conv = getattr(self, "is_conv", False)
+        conv_dim = getattr(self, "conv_dim", 0)
+        kw_dict = getattr(self, "kw_dict", {})
+
+        if is_conv:
+            op = FUNC_LIST[
+                conv_dim + 2
+            ]  # conv_dim 1->conv1d(3), 2->conv2d(4), 3->conv3d(5)
+            kernel_size = getattr(self, "kernel_size", (1,) * conv_dim)
+            in_channels = getattr(self, "in_channels", None)
+
+            # Reshape 2D weights to conv format using kernel_size
+            # down: [rank, in_channels * prod(kernel_size)] -> [rank, in_channels, *kernel_size]
+            # up: [out_channels, rank] -> [out_channels, rank, 1, 1, ...] (1x1 kernel)
+            if down.dim() == 2:
+                # down.shape[1] = in_channels * prod(kernel_size)
+                if in_channels is not None:
+                    down = down.view(down.shape[0], in_channels, *kernel_size)
+                else:
+                    # Fallback: assume 1x1 kernel if in_channels unknown
+                    down = down.view(*down.shape, *([1] * conv_dim))
+            if up.dim() == 2:
+                # up always uses 1x1 kernel
+                up = up.view(*up.shape, *([1] * conv_dim))
+            if mid is not None:
+                mid = mid.to(dtype=x.dtype)
+                if mid.dim() == 2:
+                    mid = mid.view(*mid.shape, *([1] * conv_dim))
+        else:
+            op = F.linear
+            kw_dict = {}  # linear doesn't take stride/padding
+
+        # Simple chain: down -> mid (if tucker) -> up
+        # The conv kwargs are applied exactly once: on mid when it exists,
+        # otherwise on down.
+        if mid is not None:
+            if not is_conv:
+                # The conv branch above already cast mid; do it here for linear
+                mid = mid.to(dtype=x.dtype)
+            hidden = op(x, down)
+            hidden = op(hidden, mid, **kw_dict)
+            out = op(hidden, up)
+        else:
+            hidden = op(x, down, **kw_dict)
+            out = op(hidden, up)
+
+        return out * scale
@@ -3,13 +3,18 @@ from typing import Optional

 import torch
 import comfy.model_management
-from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization
+from .base import (
+    WeightAdapterBase,
+    WeightAdapterTrainBase,
+    weight_decompose,
+    factorization,
+)


 class OFTDiff(WeightAdapterTrainBase):
    def __init__(self, weights):
        super().__init__()
-        # Unpack weights tuple from LoHaAdapter
+        # Unpack weights tuple from OFTAdapter
        blocks, rescale, alpha, _ = weights

        # Create trainable parameters
@@ -52,6 +57,78 @@ class OFTDiff(WeightAdapterTrainBase):
            weight = self.rescale * weight
        return weight.to(org_dtype)

+    def _get_orthogonal_matrix(self, device, dtype):
+        """Compute the orthogonal rotation matrix R from OFT blocks.
+
+        Builds Q = blocks - blocks^T, optionally rescales it so its norm does
+        not exceed self.constraint, and returns the Cayley transform
+        R = (I + Q)(I - Q)^-1.
+
+        The transform itself is evaluated in float32 and the result is cast
+        back to `dtype`, so the method also works when `dtype` is a reduced
+        precision type such as float16/bfloat16.
+
+        Args:
+            device: Device on which to build R.
+            dtype: dtype of the returned tensor.
+
+        Returns:
+            Tensor with the same shape as self.oft_blocks, in `dtype`.
+        """
+        blocks = self.oft_blocks.to(device=device, dtype=dtype)
+        I = torch.eye(self.block_size, device=device, dtype=dtype)
+
+        # Q = blocks - blocks^T (skew-symmetric)
+        q = blocks - blocks.transpose(1, 2)
+        normed_q = q
+
+        # Apply constraint if set
+        if self.constraint:
+            q_norm = torch.norm(q) + 1e-8
+            if q_norm > self.constraint:
+                normed_q = q * self.constraint / q_norm
+
+        # Cayley transform: R = (I + Q)(I - Q)^-1
+        # Promote BOTH factors to float32: .inverse() needs a supported float
+        # type, and matmul does not type-promote, so leaving the left factor
+        # in `dtype` raises a dtype-mismatch error whenever dtype != float32.
+        r = (I + normed_q).float() @ (I - normed_q).float().inverse()
+        return r.to(dtype)
+
+    def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor:
+        """
+        OFT has no additive component - returns zeros matching base_out shape.
+
+        OFT only transforms the output via g(), it doesn't add to it.
+
+        Args:
+            x: Input tensor (unused)
+            base_out: Output from base forward; only its shape, dtype and
+                device are used (via torch.zeros_like)
+
+        Returns:
+            A zero tensor shaped like base_out.
+        """
+        return torch.zeros_like(base_out)
+
+    def g(self, y: torch.Tensor) -> torch.Tensor:
+        """
+        Output transformation for OFT: applies orthogonal rotation.
+
+        OFT transforms output channels using block-diagonal orthogonal matrices.
+
+        Args:
+            y: Output of the base forward. Channels are taken from dim 1 when
+                is_conv is true, otherwise from the last dim.
+
+        Returns:
+            Tensor with the same layout as y, with every block of block_size
+            channels transformed (and multiplied by self.rescale when
+            self.rescaled is set).
+        """
+        r = self._get_orthogonal_matrix(y.device, y.dtype)
+
+        # Apply multiplier to interpolate between identity and full transform
+        multiplier = getattr(self, "multiplier", 1.0)
+        I = torch.eye(self.block_size, device=y.device, dtype=y.dtype)
+        r = r * multiplier + (1 - multiplier) * I
+
+        # Use module info from bypass injection; when is_conv was never set,
+        # fall back to treating any input with more than 2 dims as conv.
+        # NOTE(review): that fallback also matches a 3D linear activation -
+        # confirm is_conv is always injected before g() is called.
+        is_conv = getattr(self, "is_conv", y.dim() > 2)
+
+        if is_conv:
+            # Conv output: swap dim 1 (channels) with the last dim so that
+            # channels come last, e.g. (N, C, H, W) -> (N, W, H, C)
+            y = y.transpose(1, -1)
+
+        # y now has channels in last dim
+        *batch_shape, out_features = y.shape
+
+        # Reshape to apply block-diagonal transform
+        # (*, out_features) -> (*, block_num, block_size)
+        y_blocked = y.reshape(*batch_shape, self.block_num, self.block_size)
+
+        # Apply the per-block transform:
+        #   out[..., k, m] = sum_n r[k, n, m] * y_blocked[..., k, n]
+        # r: (block_num, block_size, block_size), y_blocked: (*, block_num, block_size)
+        out_blocked = torch.einsum("k n m, ... k n -> ... k m", r, y_blocked)
+
+        # Reshape back: (*, block_num, block_size) -> (*, out_features)
+        out = out_blocked.reshape(*batch_shape, out_features)
+
+        # Apply rescale if present (flattened and broadcast over the last dim)
+        if self.rescaled:
+            rescale = self.rescale.to(device=y.device, dtype=y.dtype)
+            out = out * rescale.view(-1)
+
+        if is_conv:
+            # Undo the earlier swap so channels are back in dim 1
+            out = out.transpose(1, -1)
+
+        return out
+
    def passive_memory_usage(self):
        """Calculates memory usage of the trainable parameters."""
        return sum(param.numel() * param.element_size() for param in self.parameters())
@@ -68,10 +145,10 @@ class OFTAdapter(WeightAdapterBase):
    def create_train(cls, weight, rank=1, alpha=1.0):
        out_dim = weight.shape[0]
        block_size, block_num = factorization(out_dim, rank)
-        block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=torch.float32)
-        return OFTDiff(
-            (block, None, alpha, None)
+        block = torch.zeros(
+            block_num, block_size, block_size, device=weight.device, dtype=torch.float32
        )
+        return OFTDiff((block, None, alpha, None))

    def to_train(self):
        return OFTDiff(self.weights)
@@ -127,9 +204,13 @@ class OFTAdapter(WeightAdapterBase):
            alpha = 0
        dora_scale = v[3]

-        blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype)
+        blocks = comfy.model_management.cast_to_device(
+            blocks, weight.device, intermediate_dtype
+        )
        if rescale is not None:
-            rescale = comfy.model_management.cast_to_device(rescale, weight.device, intermediate_dtype)
+            rescale = comfy.model_management.cast_to_device(
+                rescale, weight.device, intermediate_dtype
+            )

        block_num, block_size, *_ = blocks.shape

@@ -139,23 +220,108 @@ class OFTAdapter(WeightAdapterBase):
            # for Q = -Q^T
            q = blocks - blocks.transpose(1, 2)
            normed_q = q
-            if alpha > 0: # alpha in oft/boft is for constraint
+            if alpha > 0:  # alpha in oft/boft is for constraint
                q_norm = torch.norm(q) + 1e-8
                if q_norm > alpha:
                    normed_q = q * alpha / q_norm
            # use float() to prevent unsupported type in .inverse()
            r = (I + normed_q) @ (I - normed_q).float().inverse()
            r = r.to(weight)
+            # Create I in weight's dtype for the einsum
+            I_w = torch.eye(block_size, device=weight.device, dtype=weight.dtype)
            _, *shape = weight.shape
            lora_diff = torch.einsum(
                "k n m, k n ... -> k m ...",
-                (r * strength) - strength * I,
+                (r * strength) - strength * I_w,
                weight.view(block_num, block_size, *shape),
            ).view(-1, *shape)
            if dora_scale is not None:
-                weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                weight = weight_decompose(
+                    dora_scale,
+                    weight,
+                    lora_diff,
+                    alpha,
+                    strength,
+                    intermediate_dtype,
+                    function,
+                )
            else:
                weight += function((strength * lora_diff).type(weight.dtype))
        except Exception as e:
            logging.error("ERROR {} {} {}".format(self.name, key, e))
        return weight
+
+    def _get_orthogonal_matrix(self, device, dtype):
+        """Compute the orthogonal rotation matrix R from OFT blocks.
+
+        The Cayley transform is evaluated in float32: ``inverse()`` needs a
+        full-precision input, and multiplying a half-precision ``I + Q`` by the
+        float32 inverse raises a dtype-mismatch error. The result is cast back
+        to ``dtype`` so callers can combine it with their activations.
+
+        Returns:
+            Tuple ``(r, block_num, block_size)``; ``r`` holds one
+            ``block_size x block_size`` matrix per block and has dtype ``dtype``.
+        """
+        v = self.weights
+        # Keep every intermediate in float32; only the returned R uses `dtype`
+        blocks = v[0].to(device=device, dtype=torch.float32)
+        alpha = v[2]
+        if alpha is None:
+            alpha = 0
+
+        block_num, block_size, _ = blocks.shape
+        I = torch.eye(block_size, device=device, dtype=torch.float32)
+
+        # Q = blocks - blocks^T (skew-symmetric)
+        q = blocks - blocks.transpose(1, 2)
+        normed_q = q
+
+        # Apply constraint if alpha > 0
+        if alpha > 0:
+            q_norm = torch.norm(q) + 1e-8
+            if q_norm > alpha:
+                normed_q = q * alpha / q_norm
+
+        # Cayley transform: R = (I + Q)(I - Q)^-1
+        r = (I + normed_q) @ (I - normed_q).inverse()
+        return r.to(dtype=dtype), block_num, block_size
+
+
+    def g(self, y: torch.Tensor) -> torch.Tensor:
+        """
+        Output transformation for OFT: applies orthogonal rotation to output.
+
+        OFT transforms the output channels using block-diagonal orthogonal matrices.
+        The channel axis is split into ``block_num`` groups of ``block_size`` and
+        each group is multiplied by its own block of R. ``multiplier`` (default
+        1.0) blends R with the identity; the optional rescale is applied last.
+
+        Reference: LyCORIS DiagOFTModule._bypass_forward
+        """
+        v = self.weights
+        rescale = v[1]
+
+        r, block_num, block_size = self._get_orthogonal_matrix(y.device, y.dtype)
+
+        # Apply multiplier to interpolate between identity and full transform
+        multiplier = getattr(self, "multiplier", 1.0)
+        I = torch.eye(block_size, device=y.device, dtype=y.dtype)
+        r = r * multiplier + (1 - multiplier) * I
+
+        # Use module info from bypass injection to determine conv vs linear
+        is_conv = getattr(self, "is_conv", y.dim() > 2)
+
+        if is_conv:
+            # Conv output: swap the channel dim (1) with the last dim so the
+            # channels are trailing, e.g. (N, C, H, W) -> (N, W, H, C)
+            y = y.transpose(1, -1)
+
+        # y now has channels in last dim
+        *batch_shape, out_features = y.shape
+
+        # Reshape to apply block-diagonal transform
+        # (*, out_features) -> (*, block_num, block_size)
+        # reshape instead of view: works for any stride layout
+        y_blocked = y.reshape(*batch_shape, block_num, block_size)
+
+        # Apply orthogonal transform: R @ y for each block
+        # r: (block_num, block_size, block_size), y_blocked: (*, block_num, block_size)
+        out_blocked = torch.einsum("k n m, ... k n -> ... k m", r, y_blocked)
+
+        # Reshape back: (*, block_num, block_size) -> (*, out_features)
+        # The memory layout of an einsum result is not guaranteed, so avoid view
+        out = out_blocked.reshape(*batch_shape, out_features)
+
+        # Apply rescale if present
+        if rescale is not None:
+            rescale = rescale.to(device=y.device, dtype=y.dtype)
+            out = out * rescale.view(-1)
+
+        if is_conv:
+            # Undo the earlier swap to restore the original dim order
+            out = out.transpose(1, -1)
+
+        return out
@@ -0,0 +1,52 @@
+import ctypes
+import logging
+import psutil
+from ctypes import wintypes
+
+import comfy_aimdo.control
+
+psapi = ctypes.WinDLL("psapi")
+kernel32 = ctypes.WinDLL("kernel32")
+
+class PERFORMANCE_INFORMATION(ctypes.Structure):
+    """ctypes layout of the buffer that ``psapi.GetPerformanceInfo`` fills in.
+
+    ``get_free_ram`` multiplies ``CommitTotal`` and ``PhysicalTotal`` by
+    ``PageSize``, i.e. those counters are treated as page counts.
+    """
+    # Field order and types define the binary layout handed to the DLL, so
+    # they must stay in sync with the Win32 declaration of this structure.
+    _fields_ = [
+        ("cb", wintypes.DWORD),  # structure size in bytes; set by the caller before the query
+        ("CommitTotal", ctypes.c_size_t),
+        ("CommitLimit", ctypes.c_size_t),
+        ("CommitPeak", ctypes.c_size_t),
+        ("PhysicalTotal", ctypes.c_size_t),
+        ("PhysicalAvailable", ctypes.c_size_t),
+        ("SystemCache", ctypes.c_size_t),
+        ("KernelTotal", ctypes.c_size_t),
+        ("KernelPaged", ctypes.c_size_t),
+        ("KernelNonpaged", ctypes.c_size_t),
+        ("PageSize", ctypes.c_size_t),
+        ("HandleCount", wintypes.DWORD),
+        ("ProcessCount", wintypes.DWORD),
+        ("ThreadCount", wintypes.DWORD),
+    ]
+
+def get_free_ram():
+    """Estimate the free system RAM in bytes.
+
+    Falls back to psutil's "available" figure when the Windows performance
+    query fails.
+    """
+    # Windows is way too conservative and counts recently used, uncommitted
+    # model RAM as "in-use". So report, for general use, the greater of:
+    #
+    #   1: what psutil says
+    #   2: total memory - (committed memory - VRAM in use)
+    #
+    # VRAM in use is subtracted from the committed memory because WDDM creates
+    # a naked commit charge for all VRAM used, just in case it wants to page it
+    # all out. That isn't realistic, so "overcommit" by subtracting it off.
+
+    perf = PERFORMANCE_INFORMATION()
+    perf.cb = ctypes.sizeof(perf)
+
+    query_ok = psapi.GetPerformanceInfo(ctypes.byref(perf), perf.cb)
+    if not query_ok:
+        logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal")
+        return psutil.virtual_memory().available
+
+    # The counters are scaled by the page size to get bytes
+    page_size = perf.PageSize
+    committed_bytes = perf.CommitTotal * page_size
+    physical_bytes = perf.PhysicalTotal * page_size
+
+    psutil_available = psutil.virtual_memory().available
+    vram_in_use = comfy_aimdo.control.get_total_vram_usage()
+    adjusted_free = physical_bytes - (committed_bytes - vram_in_use)
+    return max(psutil_available, adjusted_free)
+
@@ -7,7 +7,7 @@ from comfy_api.internal.singleton import ProxiedSingleton
 from comfy_api.internal.async_to_sync import create_sync_class
 from ._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
 from ._input_impl import VideoFromFile, VideoFromComponents
-from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
+from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL, File3D
 from . import _io_public as io
 from . import _ui_public as ui
 from comfy_execution.utils import get_executing_context
@@ -105,6 +105,7 @@ class Types:
    VideoComponents = VideoComponents
    MESH = MESH
    VOXEL = VOXEL
+    File3D = File3D

 ComfyAPI = ComfyAPI_latest

@@ -27,7 +27,7 @@ if TYPE_CHECKING:
 from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class,
    prune_dict, shallow_clone_class)
 from comfy_execution.graph_utils import ExecutionBlocker
-from ._util import MESH, VOXEL, SVG as _SVG
+from ._util import MESH, VOXEL, SVG as _SVG, File3D


 class FolderType(str, Enum):
@@ -667,6 +667,49 @@ class Voxel(ComfyTypeIO):
 class Mesh(ComfyTypeIO):
    Type = MESH

+
+@comfytype(io_type="FILE_3D")
+class File3DAny(ComfyTypeIO):
+    """General 3D file type - accepts any supported 3D format."""
+    Type = File3D
+
+
+@comfytype(io_type="FILE_3D_GLB")
+class File3DGLB(ComfyTypeIO):
+    """GLB format 3D file - binary glTF, best for web and cross-platform."""
+    Type = File3D
+
+
+@comfytype(io_type="FILE_3D_GLTF")
+class File3DGLTF(ComfyTypeIO):
+    """GLTF format 3D file - JSON-based glTF with external resources."""
+    Type = File3D
+
+
+@comfytype(io_type="FILE_3D_FBX")
+class File3DFBX(ComfyTypeIO):
+    """FBX format 3D file - best for game engines and animation."""
+    Type = File3D
+
+
+@comfytype(io_type="FILE_3D_OBJ")
+class File3DOBJ(ComfyTypeIO):
+    """OBJ format 3D file - simple geometry format."""
+    Type = File3D
+
+
+@comfytype(io_type="FILE_3D_STL")
+class File3DSTL(ComfyTypeIO):
+    """STL format 3D file - best for 3D printing."""
+    Type = File3D
+
+
+@comfytype(io_type="FILE_3D_USDZ")
+class File3DUSDZ(ComfyTypeIO):
+    """USDZ format 3D file - Apple AR format."""
+    Type = File3D
+
+
@comfytype(io_type="HOOKS")
 class Hooks(ComfyTypeIO):
    if TYPE_CHECKING:
@@ -754,7 +797,7 @@ class AnyType(ComfyTypeIO):
    Type = Any

@comfytype(io_type="MODEL_PATCH")
-class MODEL_PATCH(ComfyTypeIO):
+class ModelPatch(ComfyTypeIO):
    Type = Any

@comfytype(io_type="AUDIO_ENCODER")
@@ -1146,6 +1189,20 @@ class ImageCompare(ComfyTypeI):
      def as_dict(self):
          return super().as_dict()

+
+@comfytype(io_type="COLOR")
+class Color(ComfyTypeIO):
+  """IO type registered as "COLOR" whose value is a string."""
+  Type = str
+
+  class Input(WidgetInput):
+      """Widget input for a color value; defaults to "#ffffff" and to socketless."""
+      def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+                   socketless: bool=True, advanced: bool=None, default: str="#ffffff"):
+          # NOTE(review): the arguments are forwarded positionally, with None for the
+          # slots this widget does not expose - confirm the order against WidgetInput.__init__.
+          super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
+          # Annotation only (no assignment): narrows the declared type of `default`
+          self.default: str
+
+      def as_dict(self):
+          # Returns the parent's dictionary unchanged
+          return super().as_dict()
+
 DYNAMIC_INPUT_LOOKUP: dict[str, Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]] = {}
 def register_dynamic_input_func(io_type: str, func: Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]):
    DYNAMIC_INPUT_LOOKUP[io_type] = func
@@ -1234,6 +1291,7 @@ class Hidden(str, Enum):
 class NodeInfoV1:
    input: dict=None
    input_order: dict[str, list[str]]=None
+    is_input_list: bool=None
    output: list[str]=None
    output_is_list: list[bool]=None
    output_name: list[str]=None
@@ -1247,24 +1305,10 @@ class NodeInfoV1:
    output_node: bool=None
    deprecated: bool=None
    experimental: bool=None
+    dev_only: bool=None
    api_node: bool=None
    price_badge: dict | None = None
-
-@dataclass
-class NodeInfoV3:
-    input: dict=None
-    output: dict=None
-    hidden: list[str]=None
-    name: str=None
-    display_name: str=None
-    description: str=None
-    python_module: Any = None
-    category: str=None
-    output_node: bool=None
-    deprecated: bool=None
-    experimental: bool=None
-    api_node: bool=None
-    price_badge: dict | None = None
+    search_aliases: list[str]=None


@dataclass
@@ -1346,6 +1390,8 @@ class Schema:
    hidden: list[Hidden] = field(default_factory=list)
    description: str=""
    """Node description, shown as a tooltip when hovering over the node."""
+    search_aliases: list[str] = field(default_factory=list)
+    """Alternative names for search. Useful for synonyms, abbreviations, or old names after renaming."""
    is_input_list: bool = False
    """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.

@@ -1372,6 +1418,8 @@ class Schema:
    """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
    is_experimental: bool=False
    """Flags a node as experimental, informing users that it may change or not work as expected."""
+    is_dev_only: bool=False
+    """Flags a node as dev-only, hiding it from search/menus unless dev mode is enabled."""
    is_api_node: bool=False
    """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview."""
    price_badge: PriceBadge | None = None
@@ -1380,6 +1428,8 @@ class Schema:
    """Flags a node as not idempotent; when True, the node will run and not reuse the cached outputs when identical inputs are provided on a different node in the graph."""
    enable_expand: bool=False
    """Flags a node as expandable, allowing NodeOutput to include 'expand' property."""
+    accept_all_inputs: bool=False
+    """When True, all inputs from the prompt will be passed to the node as kwargs, even if not defined in the schema."""

    def validate(self):
        '''Validate the schema:
@@ -1468,6 +1518,7 @@ class Schema:
        info = NodeInfoV1(
            input=input,
            input_order={key: list(value.keys()) for (key, value) in input.items()},
+            is_input_list=self.is_input_list,
            output=output,
            output_is_list=output_is_list,
            output_name=output_name,
@@ -1480,42 +1531,11 @@ class Schema:
            output_node=self.is_output_node,
            deprecated=self.is_deprecated,
            experimental=self.is_experimental,
+            dev_only=self.is_dev_only,
            api_node=self.is_api_node,
            python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes"),
            price_badge=self.price_badge.as_dict(self.inputs) if self.price_badge is not None else None,
-        )
-        return info
-
-
-    def get_v3_info(self, cls) -> NodeInfoV3:
-        input_dict = {}
-        output_dict = {}
-        hidden_list = []
-        # TODO: make sure dynamic types will be handled correctly
-        if self.inputs:
-            for input in self.inputs:
-                add_to_dict_v3(input, input_dict)
-        if self.outputs:
-            for output in self.outputs:
-                add_to_dict_v3(output, output_dict)
-        if self.hidden:
-            for hidden in self.hidden:
-                hidden_list.append(hidden.value)
-
-        info = NodeInfoV3(
-            input=input_dict,
-            output=output_dict,
-            hidden=hidden_list,
-            name=self.node_id,
-            display_name=self.display_name,
-            description=self.description,
-            category=self.category,
-            output_node=self.is_output_node,
-            deprecated=self.is_deprecated,
-            experimental=self.is_experimental,
-            api_node=self.is_api_node,
-            python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes"),
-            price_badge=self.price_badge.as_dict(self.inputs) if self.price_badge is not None else None,
+            search_aliases=self.search_aliases if self.search_aliases else None,
        )
        return info

@@ -1573,9 +1593,6 @@ def add_to_dict_v1(i: Input, d: dict):
    as_dict.pop("optional", None)
    d.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict)

-def add_to_dict_v3(io: Input | Output, d: dict):
-    d[io.id] = (io.get_io_type(), io.as_dict())
-
 class DynamicPathsDefaultValue:
    EMPTY_DICT = "empty_dict"

@@ -1736,13 +1753,6 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
        # set hidden
        type_clone.hidden = HiddenHolder.from_v3_data(v3_data)
        return type_clone
-
-    @final
-    @classmethod
-    def GET_NODE_INFO_V3(cls) -> dict[str, Any]:
-        schema = cls.GET_SCHEMA()
-        info = schema.get_v3_info(cls)
-        return asdict(info)
    #############################################
    # V1 Backwards Compatibility code
    #--------------------------------------------
@@ -1785,6 +1795,14 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
            cls.GET_SCHEMA()
        return cls._DEPRECATED

+    # Cached flag; None means "not resolved yet"
+    _DEV_ONLY = None
+    @final
+    @classproperty
+    def DEV_ONLY(cls):  # noqa
+        """Return the cached ``_DEV_ONLY`` flag, calling ``GET_SCHEMA()`` first while it is still ``None``."""
+        if cls._DEV_ONLY is None:
+            # presumably GET_SCHEMA() fills the cache from schema.is_dev_only - confirm
+            cls.GET_SCHEMA()
+        return cls._DEV_ONLY
+
    _API_NODE = None
    @final
    @classproperty
@@ -1849,6 +1867,14 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
            cls.GET_SCHEMA()
        return cls._NOT_IDEMPOTENT

+    # Cached flag; None means "not resolved yet"
+    _ACCEPT_ALL_INPUTS = None
+    @final
+    @classproperty
+    def ACCEPT_ALL_INPUTS(cls):  # noqa
+        """Return the cached ``_ACCEPT_ALL_INPUTS`` flag, calling ``GET_SCHEMA()`` first while it is still ``None``."""
+        if cls._ACCEPT_ALL_INPUTS is None:
+            # presumably GET_SCHEMA() fills the cache from schema.accept_all_inputs - confirm
+            cls.GET_SCHEMA()
+        return cls._ACCEPT_ALL_INPUTS
+
    @final
    @classmethod
    def INPUT_TYPES(cls) -> dict[str, dict]:
@@ -1879,6 +1905,8 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
            cls._EXPERIMENTAL = schema.is_experimental
        if cls._DEPRECATED is None:
            cls._DEPRECATED = schema.is_deprecated
+        if cls._DEV_ONLY is None:
+            cls._DEV_ONLY = schema.is_dev_only
        if cls._API_NODE is None:
            cls._API_NODE = schema.is_api_node
        if cls._OUTPUT_NODE is None:
@@ -1887,6 +1915,8 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
            cls._INPUT_IS_LIST = schema.is_input_list
        if cls._NOT_IDEMPOTENT is None:
            cls._NOT_IDEMPOTENT = schema.not_idempotent
+        if cls._ACCEPT_ALL_INPUTS is None:
+            cls._ACCEPT_ALL_INPUTS = schema.accept_all_inputs

        if cls._RETURN_TYPES is None:
            output = []
@@ -2034,6 +2064,7 @@ __all__ = [
    "ControlNet",
    "Vae",
    "Model",
+    "ModelPatch",
    "ClipVision",
    "ClipVisionOutput",
    "AudioEncoder",
@@ -2049,6 +2080,13 @@ __all__ = [
    "LossMap",
    "Voxel",
    "Mesh",
+    "File3DAny",
+    "File3DGLB",
+    "File3DGLTF",
+    "File3DFBX",
+    "File3DOBJ",
+    "File3DSTL",
+    "File3DUSDZ",
    "Hooks",
    "HookKeyframes",
    "TimestepsRange",
@@ -2066,6 +2104,7 @@ __all__ = [
    "AnyType",
    "MultiType",
    "Tracks",
+    "Color",
    # Dynamic Types
    "MatchType",
    "DynamicCombo",
@@ -2074,12 +2113,10 @@ __all__ = [
    "HiddenHolder",
    "Hidden",
    "NodeInfoV1",
-    "NodeInfoV3",
    "Schema",
    "ComfyNode",
    "NodeOutput",
    "add_to_dict_v1",
-    "add_to_dict_v3",
    "V3Data",
    "ImageCompare",
    "PriceBadgeDepends",
@@ -1,5 +1,5 @@
 from .video_types import VideoContainer, VideoCodec, VideoComponents
-from .geometry_types import VOXEL, MESH
+from .geometry_types import VOXEL, MESH, File3D
 from .image_types import SVG

 __all__ = [
@@ -9,5 +9,6 @@ __all__ = [
    "VideoComponents",
    "VOXEL",
    "MESH",
+    "File3D",
    "SVG",
 ]
@@ -1,3 +1,8 @@
+import shutil
+from io import BytesIO
+from pathlib import Path
+from typing import IO
+
 import torch


@@ -10,3 +15,75 @@ class MESH:
    def __init__(self, vertices: torch.Tensor, faces: torch.Tensor):
        self.vertices = vertices
        self.faces = faces
+
+
+class File3D:
+    """Class representing a 3D file from a file path or binary stream.
+
+    Supports both disk-backed (file path) and memory-backed (BytesIO) storage.
+    The format is stored lowercase without a leading dot (e.g. ``"glb"``).
+    """
+
+    def __init__(self, source: str | Path | IO[bytes], file_format: str = ""):
+        """Wrap ``source``, which is a file path or a readable binary stream.
+
+        Args:
+            source: Path to the file (``str`` or ``pathlib.Path``) or a binary stream.
+            file_format: Explicit format; when empty it is inferred from the
+                path suffix (streams get ``""``).
+        """
+        # Store paths as str so every isinstance(..., str) check below treats
+        # Path objects as disk-backed instead of as streams.
+        if isinstance(source, Path):
+            source = str(source)
+        self._source = source
+        # Go through the setter so the constructor normalizes exactly like
+        # later assignments to `format` do.
+        self._format = ""
+        self.format = file_format or self._infer_format()
+
+    def _infer_format(self) -> str:
+        """Return the lowercase path suffix without the dot, or "" for streams."""
+        if isinstance(self._source, str):
+            return Path(self._source).suffix.lstrip(".").lower()
+        return ""
+
+    def _rewind(self) -> None:
+        """Seek the underlying stream back to the start when it supports seeking."""
+        if hasattr(self._source, "seek"):
+            self._source.seek(0)
+
+    @property
+    def format(self) -> str:
+        return self._format
+
+    @format.setter
+    def format(self, value: str) -> None:
+        self._format = value.lstrip(".").lower() if value else ""
+
+    @property
+    def is_disk_backed(self) -> bool:
+        return isinstance(self._source, str)
+
+    def get_source(self) -> str | IO[bytes]:
+        """Return the path, or the stream itself rewound to the start."""
+        if isinstance(self._source, str):
+            return self._source
+        self._rewind()
+        return self._source
+
+    def get_data(self) -> BytesIO:
+        """Return the content as a ``BytesIO`` positioned at the start.
+
+        A ``BytesIO`` source is returned as-is (not copied); other sources are
+        read into a new buffer.
+        """
+        if isinstance(self._source, str):
+            with open(self._source, "rb") as f:
+                return BytesIO(f.read())
+        self._rewind()
+        if isinstance(self._source, BytesIO):
+            return self._source
+        return BytesIO(self._source.read())
+
+    def save_to(self, path: str) -> str:
+        """Write the content to ``path``, creating parent directories, and return ``path`` as str."""
+        dest = Path(path)
+        dest.parent.mkdir(parents=True, exist_ok=True)
+
+        if isinstance(self._source, str):
+            # Copying a file onto itself would fail, so skip when both resolve
+            # to the same location.
+            if Path(self._source).resolve() != dest.resolve():
+                shutil.copy2(self._source, dest)
+        else:
+            self._rewind()
+            with open(dest, "wb") as f:
+                f.write(self._source.read())
+        return str(dest)
+
+    def get_bytes(self) -> bytes:
+        """Return the whole content as ``bytes``."""
+        if isinstance(self._source, str):
+            return Path(self._source).read_bytes()
+        self._rewind()
+        return self._source.read()
+
+    def __repr__(self) -> str:
+        if isinstance(self._source, str):
+            return f"File3D(source={self._source!r}, format={self._format!r})"
+        return f"File3D(<stream>, format={self._format!r})"
@@ -13,17 +13,6 @@ class Text2ImageTaskCreationRequest(BaseModel):
    watermark: bool | None = Field(False)


-class Image2ImageTaskCreationRequest(BaseModel):
-    model: str = Field(...)
-    prompt: str = Field(...)
-    response_format: str | None = Field("url")
-    image: str = Field(..., description="Base64 encoded string or image URL")
-    size: str | None = Field("adaptive")
-    seed: int | None = Field(..., ge=0, le=2147483647)
-    guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
-    watermark: bool | None = Field(False)
-
-
 class Seedream4Options(BaseModel):
    max_images: int = Field(15)

@@ -0,0 +1,67 @@
+from pydantic import BaseModel, Field
+
+
+class ImageGenerationRequest(BaseModel):
+    """Request model; every field except ``response_for`` is required (``Field(...)``)."""
+    model: str = Field(...)
+    prompt: str = Field(...)
+    aspect_ratio: str = Field(...)
+    n: int = Field(...)
+    seed: int = Field(...)
+    # NOTE(review): `response_for` looks like a truncated `response_format` - confirm
+    # the key against the target API schema before relying on it.
+    response_for: str = Field("url")
+
+
+class InputUrlObject(BaseModel):
+    url: str = Field(...)
+
+
+class ImageEditRequest(BaseModel):
+    """Request model; every field except ``response_for`` is required (``Field(...)``)."""
+    model: str = Field(...)
+    # The source image is passed as an object wrapping its URL
+    image: InputUrlObject = Field(...)
+    prompt: str = Field(...)
+    resolution: str = Field(...)
+    n: int = Field(...)
+    seed: int = Field(...)
+    # NOTE(review): `response_for` looks like a truncated `response_format` - confirm
+    # the key against the target API schema before relying on it.
+    response_for: str = Field("url")
+
+
+class VideoGenerationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    image: InputUrlObject | None = Field(...)
+    duration: int = Field(...)
+    aspect_ratio: str | None = Field(...)
+    resolution: str = Field(...)
+    seed: int = Field(...)
+
+
+class VideoEditRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    video: InputUrlObject = Field(...)
+    seed: int = Field(...)
+
+
+class ImageResponseObject(BaseModel):
+    url: str | None = Field(None)
+    b64_json: str | None = Field(None)
+    revised_prompt: str | None = Field(None)
+
+
+class ImageGenerationResponse(BaseModel):
+    data: list[ImageResponseObject] = Field(...)
+
+
+class VideoGenerationResponse(BaseModel):
+    request_id: str = Field(...)
+
+
+class VideoResponseObject(BaseModel):
+    url: str = Field(...)
+    upsampled_prompt: str | None = Field(None)
+    duration: int = Field(...)
+
+
+class VideoStatusResponse(BaseModel):
+    status: str | None = Field(None)
+    video: VideoResponseObject | None = Field(None)
+    model: str | None = Field(None)
@@ -0,0 +1,51 @@
+from typing import TypedDict
+
+from pydantic import BaseModel, Field
+
+
+class InputVideoModel(TypedDict):
+    model: str
+    resolution: str
+
+
+class ImageEnhanceTaskCreateRequest(BaseModel):
+    model_name: str = Field(...)
+    img_url: str = Field(...)
+    extension: str = Field(".png")
+    exif: bool = Field(False)
+    DPI: int | None = Field(None)
+
+
+class VideoEnhanceTaskCreateRequest(BaseModel):
+    video_url: str = Field(...)
+    extension: str = Field(".mp4")
+    model_name: str | None = Field(...)
+    resolution: list[int] = Field(..., description="Target resolution [width, height]")
+    original_resolution: list[int] = Field(..., description="Original video resolution [width, height]")
+
+
+class TaskCreateDataResponse(BaseModel):
+    job_id: str = Field(...)
+    consume_coins: int | None = Field(None)
+
+
+class TaskStatusPollRequest(BaseModel):
+    job_id: str = Field(...)
+
+
+class TaskCreateResponse(BaseModel):
+    code: int = Field(...)
+    message: str = Field(...)
+    data: TaskCreateDataResponse | None = Field(None)
+
+
+class TaskStatusDataResponse(BaseModel):
+    job_id: str = Field(...)
+    status: str = Field(...)
+    res_url: str = Field("")
+
+
+class TaskStatusResponse(BaseModel):
+    code: int = Field(...)
+    message: str = Field(...)
+    data: TaskStatusDataResponse = Field(...)
@@ -0,0 +1,66 @@
+from typing import TypedDict
+
+from pydantic import BaseModel, Field, model_validator
+
+
+class InputGenerateType(TypedDict):
+    generate_type: str
+    polygon_type: str
+    pbr: bool
+
+
+class Hunyuan3DViewImage(BaseModel):
+    ViewType: str = Field(..., description="Valid values: back, left, right.")
+    ViewImageUrl: str = Field(...)
+
+
+class To3DProTaskRequest(BaseModel):
+    Model: str = Field(...)
+    Prompt: str | None = Field(None)
+    ImageUrl: str | None = Field(None)
+    MultiViewImages: list[Hunyuan3DViewImage] | None = Field(None)
+    EnablePBR: bool | None = Field(...)
+    FaceCount: int | None = Field(...)
+    GenerateType: str | None = Field(...)
+    PolygonType: str | None = Field(...)
+
+
+class RequestError(BaseModel):
+    Code: str = Field("")
+    Message: str = Field("")
+
+
+class To3DProTaskCreateResponse(BaseModel):
+    """Task creation result; both ``JobId`` and ``Error`` are optional."""
+
+    JobId: str | None = Field(None)
+    Error: RequestError | None = Field(None)
+
+    @model_validator(mode="before")
+    @classmethod
+    def unwrap_data(cls, values: dict) -> dict:
+        """Return the nested ``Response`` object when the payload is wrapped in one.
+
+        "Before" validators receive the raw input, which is not guaranteed to
+        be a dict; anything else is passed through untouched so pydantic
+        reports a regular validation error instead of this hook raising
+        ``TypeError``.
+        """
+        if isinstance(values, dict) and isinstance(values.get("Response"), dict):
+            return values["Response"]
+        return values
+
+
+class ResultFile3D(BaseModel):
+    Type: str = Field(...)
+    Url: str = Field(...)
+    PreviewImageUrl: str = Field("")
+
+
+class To3DProTaskResultResponse(BaseModel):
+    """Task query result; only ``Status`` is required."""
+
+    ErrorCode: str = Field("")
+    ErrorMessage: str = Field("")
+    # default_factory gives each instance its own empty list
+    ResultFile3Ds: list[ResultFile3D] = Field(default_factory=list)
+    Status: str = Field(...)
+
+    @model_validator(mode="before")
+    @classmethod
+    def unwrap_data(cls, values: dict) -> dict:
+        """Return the nested ``Response`` object when the payload is wrapped in one.
+
+        "Before" validators receive the raw input, which is not guaranteed to
+        be a dict; anything else is passed through untouched so pydantic
+        reports a regular validation error instead of this hook raising
+        ``TypeError``.
+        """
+        if isinstance(values, dict) and isinstance(values.get("Response"), dict):
+            return values["Response"]
+        return values
+
+
+class To3DProTaskQueryRequest(BaseModel):
+    JobId: str = Field(...)
@@ -0,0 +1,122 @@
+from typing import TypedDict
+
+from pydantic import AliasChoices, BaseModel, Field, model_validator
+
+
+class InputPortraitMode(TypedDict):
+    portrait_mode: str
+    portrait_style: str
+    portrait_beautifier: str
+
+
+class InputAdvancedSettings(TypedDict):
+    advanced_settings: str
+    whites: int
+    blacks: int
+    brightness: int
+    contrast: int
+    saturation: int
+    engine: str
+    transfer_light_a: str
+    transfer_light_b: str
+    fixed_generation: bool
+
+
+class InputSkinEnhancerMode(TypedDict):
+    mode: str
+    skin_detail: int
+    optimized_for: str
+
+
+class ImageUpscalerCreativeRequest(BaseModel):
+    image: str = Field(...)
+    scale_factor: str = Field(...)
+    optimized_for: str = Field(...)
+    prompt: str | None = Field(None)
+    creativity: int = Field(...)
+    hdr: int = Field(...)
+    resemblance: int = Field(...)
+    fractality: int = Field(...)
+    engine: str = Field(...)
+
+
+class ImageUpscalerPrecisionV2Request(BaseModel):
+    image: str = Field(...)
+    sharpen: int = Field(...)
+    smart_grain: int = Field(...)
+    ultra_detail: int = Field(...)
+    flavor: str = Field(...)
+    scale_factor: int = Field(...)
+
+
+class ImageRelightAdvancedSettingsRequest(BaseModel):
+    whites: int = Field(...)
+    blacks: int = Field(...)
+    brightness: int = Field(...)
+    contrast: int = Field(...)
+    saturation: int = Field(...)
+    engine: str = Field(...)
+    transfer_light_a: str = Field(...)
+    transfer_light_b: str = Field(...)
+    fixed_generation: bool = Field(...)
+
+
+class ImageRelightRequest(BaseModel):
+    image: str = Field(...)
+    prompt: str | None = Field(None)
+    transfer_light_from_reference_image: str | None = Field(None)
+    light_transfer_strength: int = Field(...)
+    interpolate_from_original: bool = Field(...)
+    change_background: bool = Field(...)
+    style: str = Field(...)
+    preserve_details: bool = Field(...)
+    advanced_settings: ImageRelightAdvancedSettingsRequest | None = Field(...)
+
+
+class ImageStyleTransferRequest(BaseModel):
+    image: str = Field(...)
+    reference_image: str = Field(...)
+    prompt: str | None = Field(None)
+    style_strength: int = Field(...)
+    structure_strength: int = Field(...)
+    is_portrait: bool = Field(...)
+    portrait_style: str | None = Field(...)
+    portrait_beautifier: str | None = Field(...)
+    flavor: str = Field(...)
+    engine: str = Field(...)
+    fixed_generation: bool = Field(...)
+
+
+class ImageSkinEnhancerCreativeRequest(BaseModel):
+    image: str = Field(...)
+    sharpen: int = Field(...)
+    smart_grain: int = Field(...)
+
+
+class ImageSkinEnhancerFaithfulRequest(BaseModel):
+    image: str = Field(...)
+    sharpen: int = Field(...)
+    smart_grain: int = Field(...)
+    skin_detail: int = Field(...)
+
+
+class ImageSkinEnhancerFlexibleRequest(BaseModel):
+    image: str = Field(...)
+    sharpen: int = Field(...)
+    smart_grain: int = Field(...)
+    optimized_for: str = Field(...)
+
+
+class TaskResponse(BaseModel):
+    """Unified response model that handles both wrapped and unwrapped API responses."""
+
+    task_id: str = Field(...)
+    # Accepts either "status" or "task_status" as the input key
+    status: str = Field(validation_alias=AliasChoices("status", "task_status"))
+    generated: list[str] | None = Field(None)
+
+    @model_validator(mode="before")
+    @classmethod
+    def unwrap_data(cls, values: dict) -> dict:
+        """Return the nested ``data`` object when the payload wraps the task in one.
+
+        "Before" validators receive the raw input, which is not guaranteed to
+        be a dict; anything else is passed through untouched so pydantic
+        reports a regular validation error instead of this hook raising
+        ``TypeError``.
+        """
+        if isinstance(values, dict) and isinstance(values.get("data"), dict):
+            return values["data"]
+        return values
@@ -109,14 +109,19 @@ class MeshyTextureRequest(BaseModel):

 class MeshyModelsUrls(BaseModel):
    glb: str = Field("")
+    fbx: str = Field("")
+    usdz: str = Field("")
+    obj: str = Field("")


 class MeshyRiggedModelsUrls(BaseModel):
    rigged_character_glb_url: str = Field("")
+    rigged_character_fbx_url: str = Field("")


 class MeshyAnimatedModelsUrls(BaseModel):
    animation_glb_url: str = Field("")
+    animation_fbx_url: str = Field("")


 class MeshyResultTextureUrls(BaseModel):
@@ -1,11 +1,8 @@
 from __future__ import annotations

-
-
 from enum import Enum
-from typing import Optional

-from pydantic import BaseModel, Field, conint, confloat
+from pydantic import BaseModel, Field


 class RecraftColor:
@@ -229,24 +226,24 @@ class RecraftColorObject(BaseModel):


 class RecraftControlsObject(BaseModel):
-    colors: Optional[list[RecraftColorObject]] = Field(None, description='An array of preferable colors')
-    background_color: Optional[RecraftColorObject] = Field(None, description='Use given color as a desired background color')
-    no_text: Optional[bool] = Field(None, description='Do not embed text layouts')
-    artistic_level: Optional[conint(ge=0, le=5)] = Field(None, description='Defines artistic tone of your image. At a simple level, the person looks straight at the camera in a static and clean style. Dynamic and eccentric levels introduce movement and creativity. The value should be in range [0..5].')
+    colors: list[RecraftColorObject] | None = Field(None, description='An array of preferable colors')
+    background_color: RecraftColorObject | None = Field(None, description='Use given color as a desired background color')
+    no_text: bool | None = Field(None, description='Do not embed text layouts')
+    artistic_level: int | None = Field(None, description='Defines artistic tone of your image. At a simple level, the person looks straight at the camera in a static and clean style. Dynamic and eccentric levels introduce movement and creativity. The value should be in range [0..5].')


 class RecraftImageGenerationRequest(BaseModel):
    prompt: str = Field(..., description='The text prompt describing the image to generate')
-    size: Optional[RecraftImageSize] = Field(None, description='The size of the generated image (e.g., "1024x1024")')
-    n: conint(ge=1, le=6) = Field(..., description='The number of images to generate')
-    negative_prompt: Optional[str] = Field(None, description='A text description of undesired elements on an image')
-    model: Optional[RecraftModel] = Field(RecraftModel.recraftv3, description='The model to use for generation (e.g., "recraftv3")')
-    style: Optional[str] = Field(None, description='The style to apply to the generated image (e.g., "digital_illustration")')
-    substyle: Optional[str] = Field(None, description='The substyle to apply to the generated image, depending on the style input')
-    controls: Optional[RecraftControlsObject] = Field(None, description='A set of custom parameters to tweak generation process')
-    style_id: Optional[str] = Field(None, description='Use a previously uploaded style as a reference; UUID')
-    strength: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description='Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity')
-    random_seed: Optional[int] = Field(None, description="Seed for video generation")
+    size: RecraftImageSize | None = Field(None, description='The size of the generated image (e.g., "1024x1024")')
+    n: int = Field(..., description='The number of images to generate')
+    negative_prompt: str | None = Field(None, description='A text description of undesired elements on an image')
+    model: RecraftModel | None = Field(RecraftModel.recraftv3, description='The model to use for generation (e.g., "recraftv3")')
+    style: str | None = Field(None, description='The style to apply to the generated image (e.g., "digital_illustration")')
+    substyle: str | None = Field(None, description='The substyle to apply to the generated image, depending on the style input')
+    controls: RecraftControlsObject | None = Field(None, description='A set of custom parameters to tweak generation process')
+    style_id: str | None = Field(None, description='Use a previously uploaded style as a reference; UUID')
+    strength: float | None = Field(None, description='Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity')
+    random_seed: int | None = Field(None, description="Seed for video generation")
    # text_layout


@@ -258,5 +255,13 @@ class RecraftReturnedObject(BaseModel):
 class RecraftImageGenerationResponse(BaseModel):
    created: int = Field(..., description='Unix timestamp when the generation was created')
    credits: int = Field(..., description='Number of credits used for the generation')
-    data: Optional[list[RecraftReturnedObject]] = Field(None, description='Array of generated image information')
-    image: Optional[RecraftReturnedObject] = Field(None, description='Single generated image')
+    data: list[RecraftReturnedObject] | None = Field(None, description='Array of generated image information')
+    image: RecraftReturnedObject | None = Field(None, description='Single generated image')
+
+
+class RecraftCreateStyleRequest(BaseModel):
+    """Request body for creating a Recraft style; ``style`` is required."""
+
+    style: str = Field(..., description="realistic_image, digital_illustration, vector_illustration, or icon")
+
+
+class RecraftCreateStyleResponse(BaseModel):
+    """Response to a style-creation request; carries the new style's identifier."""
+
+    id: str = Field(..., description="UUID of the created style")
@@ -6,6 +6,30 @@ class SubjectReference(BaseModel):
    images: list[str] = Field(...)


+class FrameSetting(BaseModel):
+    """One entry of ``TaskMultiFrameCreationRequest.image_settings``; every field is required."""
+
+    prompt: str = Field(...)
+    key_image: str = Field(...)
+    duration: int = Field(...)  # unit is not stated here — presumably seconds, confirm against the API
+
+
+class TaskMultiFrameCreationRequest(BaseModel):
+    """Request body for a multi-frame task: a start image plus a list of per-frame settings."""
+
+    model: str = Field(...)
+    # Validation restricts the seed to 0..2147483647 inclusive.
+    seed: int = Field(..., ge=0, le=2147483647)
+    resolution: str = Field(...)
+    start_image: str = Field(...)
+    image_settings: list[FrameSetting] = Field(...)
+
+
+class TaskExtendCreationRequest(BaseModel):
+    """Request body for extending an existing video identified by ``video_url``."""
+
+    model: str = Field(...)
+    # Prompts longer than 2000 characters are rejected by validation.
+    prompt: str = Field(..., max_length=2000)
+    duration: int = Field(...)
+    # Validation restricts the seed to 0..2147483647 inclusive.
+    seed: int = Field(..., ge=0, le=2147483647)
+    resolution: str = Field(...)
+    # Optional; defaults to None when no images are supplied.
+    images: list[str] | None = Field(None, description="Base64 encoded string or image URL")
+    video_url: str = Field(..., description="URL of the video to extend")
+
+
 class TaskCreationRequest(BaseModel):
    model: str = Field(...)
    prompt: str = Field(..., max_length=2000)
@@ -0,0 +1,35 @@
+from pydantic import BaseModel, Field
+
+
+class SeedVR2ImageRequest(BaseModel):
+    """Request body for a SeedVR2 image task.
+
+    ``image`` and ``target_resolution`` are required; ``output_format`` defaults to
+    "png" and ``enable_sync_mode`` defaults to False.
+    """
+
+    image: str = Field(...)
+    target_resolution: str = Field(...)
+    output_format: str = Field("png")
+    enable_sync_mode: bool = Field(False)
+
+
+class FlashVSRRequest(BaseModel):
+    """Request body for a FlashVSR video task; every field is required."""
+
+    target_resolution: str = Field(...)
+    video: str = Field(...)
+    duration: float = Field(...)  # unit is not stated here — presumably seconds, confirm against the API
+
+
+class TaskCreatedDataResponse(BaseModel):
+    """``data`` payload of ``TaskCreatedResponse``; carries the required ``id``."""
+
+    id: str = Field(...)
+
+
+class TaskCreatedResponse(BaseModel):
+    """Envelope returned when a task is created: code, message and an optional ``data`` payload."""
+
+    code: int = Field(...)
+    message: str = Field(...)
+    # May be absent/None, so callers must check before reading ``data.id``.
+    data: TaskCreatedDataResponse | None = Field(None)
+
+
+class TaskResultDataResponse(BaseModel):
+    """``data`` payload of ``TaskResultResponse``: a status string and a list of outputs."""
+
+    status: str = Field(...)
+    # Defaults to an empty list when the response carries no outputs.
+    outputs: list[str] = Field([])
+
+
+class TaskResultResponse(BaseModel):
+    """Envelope returned when querying a task: code, message and an optional ``data`` payload."""
+
+    code: int = Field(...)
+    message: str = Field(...)
+    # May be absent/None, so callers must check before reading ``data.status``.
+    data: TaskResultDataResponse | None = Field(None)
@@ -24,7 +24,7 @@ class BriaImageEditNode(IO.ComfyNode):
    def define_schema(cls):
        return IO.Schema(
            node_id="BriaImageEditNode",
-            display_name="Bria Image Edit",
+            display_name="Bria FIBO Image Edit",
            category="api node/image/Bria",
            description="Edit images using Bria latest model",
            inputs=[
@@ -9,7 +9,6 @@ from comfy_api_nodes.apis.bytedance import (
    RECOMMENDED_PRESETS,
    RECOMMENDED_PRESETS_SEEDREAM_4,
    VIDEO_TASKS_EXECUTION_TIME,
-    Image2ImageTaskCreationRequest,
    Image2VideoTaskCreationRequest,
    ImageTaskCreationResponse,
    Seedream4Options,
@@ -174,99 +173,6 @@ class ByteDanceImageNode(IO.ComfyNode):
        return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))


-class ByteDanceImageEditNode(IO.ComfyNode):
-
-    @classmethod
-    def define_schema(cls):
-        return IO.Schema(
-            node_id="ByteDanceImageEditNode",
-            display_name="ByteDance Image Edit",
-            category="api node/image/ByteDance",
-            description="Edit images using ByteDance models via api based on prompt",
-            inputs=[
-                IO.Combo.Input("model", options=["seededit-3-0-i2i-250628"]),
-                IO.Image.Input(
-                    "image",
-                    tooltip="The base image to edit",
-                ),
-                IO.String.Input(
-                    "prompt",
-                    multiline=True,
-                    default="",
-                    tooltip="Instruction to edit image",
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=2147483647,
-                    step=1,
-                    display_mode=IO.NumberDisplay.number,
-                    control_after_generate=True,
-                    tooltip="Seed to use for generation",
-                    optional=True,
-                ),
-                IO.Float.Input(
-                    "guidance_scale",
-                    default=5.5,
-                    min=1.0,
-                    max=10.0,
-                    step=0.01,
-                    display_mode=IO.NumberDisplay.number,
-                    tooltip="Higher value makes the image follow the prompt more closely",
-                    optional=True,
-                ),
-                IO.Boolean.Input(
-                    "watermark",
-                    default=False,
-                    tooltip='Whether to add an "AI generated" watermark to the image',
-                    optional=True,
-                ),
-            ],
-            outputs=[
-                IO.Image.Output(),
-            ],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            is_deprecated=True,
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        model: str,
-        image: Input.Image,
-        prompt: str,
-        seed: int,
-        guidance_scale: float,
-        watermark: bool,
-    ) -> IO.NodeOutput:
-        validate_string(prompt, strip_whitespace=True, min_length=1)
-        if get_number_of_images(image) != 1:
-            raise ValueError("Exactly one input image is required.")
-        validate_image_aspect_ratio(image, (1, 3), (3, 1))
-        source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0]
-        payload = Image2ImageTaskCreationRequest(
-            model=model,
-            prompt=prompt,
-            image=source_url,
-            seed=seed,
-            guidance_scale=guidance_scale,
-            watermark=watermark,
-        )
-        response = await sync_op(
-            cls,
-            ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"),
-            data=payload,
-            response_model=ImageTaskCreationResponse,
-        )
-        return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))
-
-
 class ByteDanceSeedreamNode(IO.ComfyNode):

    @classmethod
@@ -1101,7 +1007,6 @@ class ByteDanceExtension(ComfyExtension):
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        return [
            ByteDanceImageNode,
-            ByteDanceImageEditNode,
            ByteDanceSeedreamNode,
            ByteDanceTextToVideoNode,
            ByteDanceImageToVideoNode,
@@ -0,0 +1,417 @@
+import torch
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.grok import (
+    ImageEditRequest,
+    ImageGenerationRequest,
+    ImageGenerationResponse,
+    InputUrlObject,
+    VideoEditRequest,
+    VideoGenerationRequest,
+    VideoGenerationResponse,
+    VideoStatusResponse,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_image_tensor,
+    download_url_to_video_output,
+    get_fs_object_size,
+    get_number_of_images,
+    poll_op,
+    sync_op,
+    tensor_to_base64_string,
+    upload_video_to_comfyapi,
+    validate_string,
+    validate_video_duration,
+)
+
+
+class GrokImageNode(IO.ComfyNode):
+    """API node that generates one or more images from a text prompt."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its image output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="GrokImageNode",
+            display_name="Grok Image",
+            category="api node/image/Grok",
+            description="Generate images using Grok based on a text prompt",
+            inputs=[
+                IO.Combo.Input("model", options=["grok-imagine-image-beta"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="The text prompt used to generate the image",
+                ),
+                IO.Combo.Input(
+                    "aspect_ratio",
+                    options=[
+                        "1:1",
+                        "2:3",
+                        "3:2",
+                        "3:4",
+                        "4:3",
+                        "9:16",
+                        "16:9",
+                        "9:19.5",
+                        "19.5:9",
+                        "9:20",
+                        "20:9",
+                        "1:2",
+                        "2:1",
+                    ],
+                ),
+                IO.Int.Input(
+                    "number_of_images",
+                    default=1,
+                    min=1,
+                    max=10,
+                    step=1,
+                    tooltip="Number of images to generate",
+                    display_mode=IO.NumberDisplay.number,
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to determine if node should re-run; "
+                    "actual results are nondeterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["number_of_images"]),
+                expr="""{"type":"usd","usd":0.033 * widgets.number_of_images}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        prompt: str,
+        aspect_ratio: str,
+        number_of_images: int,
+        seed: int,
+    ) -> IO.NodeOutput:
+        """Request images for ``prompt`` and return them as a single node output.
+
+        The prompt is validated (non-empty after stripping) before the request is
+        sent. Multiple returned images are concatenated with ``torch.cat``.
+
+        Raises:
+            RuntimeError: if the response contains no usable image URL.
+        """
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/xai/v1/images/generations", method="POST"),
+            data=ImageGenerationRequest(
+                model=model,
+                prompt=prompt,
+                aspect_ratio=aspect_ratio,
+                n=number_of_images,
+                seed=seed,
+            ),
+            response_model=ImageGenerationResponse,
+        )
+        # Entries without a URL are skipped; fail with a clear message instead of
+        # letting torch.cat() choke on an empty list further down.
+        urls = [str(d.url) for d in response.data if d.url]
+        if not urls:
+            raise RuntimeError("Grok API response did not contain any image URLs.")
+        images = [await download_url_to_image_tensor(u) for u in urls]
+        if len(images) == 1:
+            return IO.NodeOutput(images[0])
+        return IO.NodeOutput(torch.cat(images))
+
+
+class GrokImageEditNode(IO.ComfyNode):
+    """API node that edits a single input image according to a text prompt."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its image output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="GrokImageEditNode",
+            display_name="Grok Image Edit",
+            category="api node/image/Grok",
+            description="Modify an existing image based on a text prompt",
+            inputs=[
+                IO.Combo.Input("model", options=["grok-imagine-image-beta"]),
+                IO.Image.Input("image"),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="The text prompt used to generate the image",
+                ),
+                IO.Combo.Input("resolution", options=["1K"]),
+                IO.Int.Input(
+                    "number_of_images",
+                    default=1,
+                    min=1,
+                    max=10,
+                    step=1,
+                    tooltip="Number of edited images to generate",
+                    display_mode=IO.NumberDisplay.number,
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to determine if node should re-run; "
+                    "actual results are nondeterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["number_of_images"]),
+                expr="""{"type":"usd","usd":0.002 + 0.033 * widgets.number_of_images}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        image: Input.Image,
+        prompt: str,
+        resolution: str,
+        number_of_images: int,
+        seed: int,
+    ) -> IO.NodeOutput:
+        """Send ``image`` and ``prompt`` to the edit endpoint and return the edited image(s).
+
+        The input image is embedded in the request as a base64 PNG data URL and the
+        resolution label is lower-cased before sending. Multiple returned images are
+        concatenated with ``torch.cat``.
+
+        Raises:
+            ValueError: if ``image`` does not contain exactly one image.
+            RuntimeError: if the response contains no usable image URL.
+        """
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        if get_number_of_images(image) != 1:
+            raise ValueError("Only one input image is supported.")
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/xai/v1/images/edits", method="POST"),
+            data=ImageEditRequest(
+                model=model,
+                image=InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(image)}"),
+                prompt=prompt,
+                resolution=resolution.lower(),
+                n=number_of_images,
+                seed=seed,
+            ),
+            response_model=ImageGenerationResponse,
+        )
+        # Entries without a URL are skipped; fail with a clear message instead of
+        # letting torch.cat() choke on an empty list further down.
+        urls = [str(d.url) for d in response.data if d.url]
+        if not urls:
+            raise RuntimeError("Grok API response did not contain any image URLs.")
+        images = [await download_url_to_image_tensor(u) for u in urls]
+        if len(images) == 1:
+            return IO.NodeOutput(images[0])
+        return IO.NodeOutput(torch.cat(images))
+
+
+class GrokVideoNode(IO.ComfyNode):
+    """API node that generates a video from a text prompt, optionally guided by one image."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its video output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="GrokVideoNode",
+            display_name="Grok Video",
+            category="api node/video/Grok",
+            description="Generate video from a prompt or an image",
+            inputs=[
+                IO.Combo.Input("model", options=["grok-imagine-video-beta"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Text description of the desired video.",
+                ),
+                IO.Combo.Input(
+                    "resolution",
+                    options=["480p", "720p"],
+                    tooltip="The resolution of the output video.",
+                ),
+                IO.Combo.Input(
+                    "aspect_ratio",
+                    options=["auto", "16:9", "4:3", "3:2", "1:1", "2:3", "3:4", "9:16"],
+                    tooltip="The aspect ratio of the output video.",
+                ),
+                IO.Int.Input(
+                    "duration",
+                    default=6,
+                    min=1,
+                    max=15,
+                    step=1,
+                    tooltip="The duration of the output video in seconds.",
+                    display_mode=IO.NumberDisplay.slider,
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to determine if node should re-run; "
+                    "actual results are nondeterministic regardless of seed.",
+                ),
+                IO.Image.Input("image", optional=True),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["duration"], inputs=["image"]),
+                expr="""
+                (
+                  $base := 0.181 * widgets.duration;
+                  {"type":"usd","usd": inputs.image.connected ? $base + 0.002 : $base}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        prompt: str,
+        resolution: str,
+        aspect_ratio: str,
+        duration: int,
+        seed: int,
+        image: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        """Submit a video generation request, poll until it finishes, and return the video.
+
+        When ``image`` is given it is embedded in the request as a base64 PNG data
+        URL. An ``aspect_ratio`` of "auto" is sent as ``None``.
+
+        Raises:
+            ValueError: if ``image`` is given but does not contain exactly one image.
+        """
+        # Validate the prompt first so an invalid prompt fails before the
+        # (potentially large) input image is base64-encoded.
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        image_url = None
+        if image is not None:
+            if get_number_of_images(image) != 1:
+                raise ValueError("Only one input image is supported.")
+            image_url = InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(image)}")
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/xai/v1/videos/generations", method="POST"),
+            data=VideoGenerationRequest(
+                model=model,
+                image=image_url,
+                prompt=prompt,
+                resolution=resolution,
+                duration=duration,
+                aspect_ratio=None if aspect_ratio == "auto" else aspect_ratio,
+                seed=seed,
+            ),
+            response_model=VideoGenerationResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/xai/v1/videos/{initial_response.request_id}"),
+            # A poll response without a status is treated as "complete".
+            status_extractor=lambda r: r.status if r.status is not None else "complete",
+            response_model=VideoStatusResponse,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.video.url))
+
+
+class GrokVideoEditNode(IO.ComfyNode):
+    """API node that edits an existing video according to a text prompt."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its video output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="GrokVideoEditNode",
+            display_name="Grok Video Edit",
+            category="api node/video/Grok",
+            description="Edit an existing video based on a text prompt.",
+            inputs=[
+                IO.Combo.Input("model", options=["grok-imagine-video-beta"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Text description of the desired video.",
+                ),
+                IO.Video.Input("video", tooltip="Maximum supported duration is 8.7 seconds and 50MB file size."),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to determine if node should re-run; "
+                    "actual results are nondeterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd": 0.191, "format": {"suffix": "/sec", "approximate": true}}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        prompt: str,
+        video: Input.Video,
+        seed: int,
+    ) -> IO.NodeOutput:
+        """Upload ``video``, submit an edit request, poll until it finishes, and return the result.
+
+        Raises:
+            ValueError: if the video's stream source is larger than 50MB.
+        """
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        # Duration limits match the tooltip on the "video" input (max 8.7 seconds).
+        validate_video_duration(video, min_duration=1, max_duration=8.7)
+        video_stream = video.get_stream_source()
+        video_size = get_fs_object_size(video_stream)
+        # Reject oversized files before uploading anything.
+        if video_size > 50 * 1024 * 1024:
+            raise ValueError(f"Video size ({video_size / 1024 / 1024:.1f}MB) exceeds 50MB limit.")
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/xai/v1/videos/edits", method="POST"),
+            data=VideoEditRequest(
+                model=model,
+                video=InputUrlObject(url=await upload_video_to_comfyapi(cls, video)),
+                prompt=prompt,
+                seed=seed,
+            ),
+            response_model=VideoGenerationResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/xai/v1/videos/{initial_response.request_id}"),
+            # A poll response without a status is treated as "complete".
+            status_extractor=lambda r: r.status if r.status is not None else "complete",
+            response_model=VideoStatusResponse,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.video.url))
+
+
+class GrokExtension(ComfyExtension):
+    """Extension that exposes the four Grok API nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        return [
+            GrokImageNode,
+            GrokImageEditNode,
+            GrokVideoNode,
+            GrokVideoEditNode,
+        ]
+
+
+async def comfy_entrypoint() -> GrokExtension:
+    """Create and return a new ``GrokExtension`` instance."""
+    return GrokExtension()
@@ -0,0 +1,342 @@
+import math
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.hitpaw import (
+    ImageEnhanceTaskCreateRequest,
+    InputVideoModel,
+    TaskCreateDataResponse,
+    TaskCreateResponse,
+    TaskStatusPollRequest,
+    TaskStatusResponse,
+    VideoEnhanceTaskCreateRequest,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_image_tensor,
+    download_url_to_video_output,
+    downscale_image_tensor,
+    get_image_dimensions,
+    poll_op,
+    sync_op,
+    upload_image_to_comfyapi,
+    upload_video_to_comfyapi,
+    validate_video_duration,
+)
+
+# Model name offered in the node's model selector -> model identifier sent in the request.
+VIDEO_MODELS_MODELS_MAP = {
+    "Portrait Restore Model (1x)": "portrait_restore_1x",
+    "Portrait Restore Model (2x)": "portrait_restore_2x",
+    "General Restore Model (1x)": "general_restore_1x",
+    "General Restore Model (2x)": "general_restore_2x",
+    "General Restore Model (4x)": "general_restore_4x",
+    "Ultra HD Model (2x)": "ultrahd_restore_2x",
+    "Generative Model (1x)": "generative_1x",
+}
+
+# Resolution name -> target size in pixels of the shorter side (non-square videos).
+RESOLUTION_TARGET_MAP = {"720p": 720, "1080p": 1080, "2K/QHD": 1440, "4K/UHD": 2160, "8K": 4320}
+
+# Resolution name -> edge length in pixels for square (1:1) videos.
+# Only the two largest entries differ from the shorter-side targets.
+RESOLUTION_SQUARE_MAP = {
+    **RESOLUTION_TARGET_MAP,
+    "4K/UHD": 2048,  # DCI 4K square
+    "8K": 4096,  # DCI 8K square
+}
+
+# Models that are not offered the 8K resolution option.
+LIMITED_RESOLUTION_MODELS = {"Generative Model (1x)"}
+
+# Resolution choices per model type: the full list is "original" plus every named
+# resolution; the limited list is the same without the last entry ("8K").
+RESOLUTIONS_FULL = ["original", *RESOLUTION_TARGET_MAP]
+RESOLUTIONS_LIMITED = RESOLUTIONS_FULL[:-1]
+
+# Maximum output size, in pixels and in whole megapixels.
+MAX_PIXELS_GENERATIVE = 32_000_000
+MAX_MP_GENERATIVE = MAX_PIXELS_GENERATIVE // 1_000_000
+
+
+class HitPawGeneralImageEnhance(IO.ComfyNode):
+    """API node that sends an image to the HitPaw photo-enhancer endpoint and returns the result image."""
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node: model/image/upscale-factor inputs, one image output and a price badge."""
+        return IO.Schema(
+            node_id="HitPawGeneralImageEnhance",
+            display_name="HitPaw General Image Enhance",
+            category="api node/image/HitPaw",
+            description="Upscale low-resolution images to super-resolution, eliminate artifacts and noise. "
+            f"Maximum output: {MAX_MP_GENERATIVE} megapixels.",
+            inputs=[
+                IO.Combo.Input("model", options=["generative_portrait", "generative"]),
+                IO.Image.Input("image"),
+                IO.Combo.Input("upscale_factor", options=[1, 2, 4]),
+                IO.Boolean.Input(
+                    "auto_downscale",
+                    default=False,
+                    tooltip="Automatically downscale input image if output would exceed the limit.",
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # The badge shows a min/max price range looked up by the selected model.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+                expr="""
+                (
+                  $prices := {
+                    "generative_portrait": {"min": 0.02, "max": 0.06},
+                    "generative": {"min": 0.05, "max": 0.15}
+                  };
+                  $price := $lookup($prices, widgets.model);
+                  {
+                    "type": "range_usd",
+                    "min_usd": $price.min,
+                    "max_usd": $price.max
+                  }
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        image: Input.Image,
+        upscale_factor: int,
+        auto_downscale: bool,
+    ) -> IO.NodeOutput:
+        """Create an enhancement task, poll it until it finishes and return the enhanced image.
+
+        If ``width * height * upscale_factor**2`` exceeds ``MAX_PIXELS_GENERATIVE`` the
+        call either adapts the request (``auto_downscale=True``: a lower upscale factor
+        and/or a downscaled input image is used) or fails.
+
+        Raises:
+            ValueError: if the output would exceed the pixel limit and ``auto_downscale``
+                is off, or if the task-creation response carries a code other than 200.
+        """
+        height, width = get_image_dimensions(image)
+        requested_scale = upscale_factor
+        output_pixels = height * width * requested_scale * requested_scale
+        if output_pixels > MAX_PIXELS_GENERATIVE:
+            if auto_downscale:
+                input_pixels = width * height
+                # Fallback used when no candidate below breaks out of the loop:
+                # factor 1 with the input downscaled to the full pixel budget.
+                scale = 1
+                max_input_pixels = MAX_PIXELS_GENERATIVE
+
+                # Try the largest factor first, never exceeding the requested one.
+                for candidate in [4, 2, 1]:
+                    if candidate > requested_scale:
+                        continue
+                    scale_output_pixels = input_pixels * candidate * candidate
+                    if scale_output_pixels <= MAX_PIXELS_GENERATIVE:
+                        # Fits as-is at this factor: no input downscale needed.
+                        scale = candidate
+                        max_input_pixels = None
+                        break
+                    # Check if we can downscale input by at most 2x to fit
+                    # (the ratio is linear: square root of the pixel-count ratio).
+                    downscale_ratio = math.sqrt(scale_output_pixels / MAX_PIXELS_GENERATIVE)
+                    if downscale_ratio <= 2.0:
+                        scale = candidate
+                        # Input pixel budget such that budget * candidate**2 stays within the limit.
+                        max_input_pixels = MAX_PIXELS_GENERATIVE // (candidate * candidate)
+                        break
+
+                if max_input_pixels is not None:
+                    image = downscale_image_tensor(image, total_pixels=max_input_pixels)
+                upscale_factor = scale
+            else:
+                output_width = width * requested_scale
+                output_height = height * requested_scale
+                raise ValueError(
+                    f"Output size ({output_width}x{output_height} = {output_pixels:,} pixels) "
+                    f"exceeds maximum allowed size of {MAX_PIXELS_GENERATIVE:,} pixels ({MAX_MP_GENERATIVE}MP). "
+                    f"Enable auto_downscale or use a smaller input image or a lower upscale factor."
+                )
+
+        initial_res = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/hitpaw/api/photo-enhancer", method="POST"),
+            response_model=TaskCreateResponse,
+            data=ImageEnhanceTaskCreateRequest(
+                # The model name sent to the API combines the selected model and the final factor.
+                model_name=f"{model}_{upscale_factor}x",
+                img_url=await upload_image_to_comfyapi(cls, image, total_pixels=None),
+            ),
+            wait_label="Creating task",
+            final_label_on_success="Task created",
+        )
+        if initial_res.code != 200:
+            raise ValueError(f"Task creation failed with code {initial_res.code}: {initial_res.message}")
+        # Presumably consume_coins is in thousandths of the displayed price unit - TODO confirm.
+        request_price = initial_res.data.consume_coins / 1000
+        # NOTE(review): the poll payload is built from TaskCreateDataResponse, while
+        # HitPawVideoEnhance uses TaskStatusPollRequest for the same endpoint - confirm this is intended.
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path="/proxy/hitpaw/api/task-status", method="POST"),
+            data=TaskCreateDataResponse(job_id=initial_res.data.job_id),
+            response_model=TaskStatusResponse,
+            status_extractor=lambda x: x.data.status,
+            # The price is taken from the creation response, not from the poll response.
+            price_extractor=lambda x: request_price,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.res_url))
+
+
+class HitPawVideoEnhance(IO.ComfyNode):
+    """API node that sends a video to the HitPaw video-enhancer endpoint and returns the result video."""
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node: a model selector with a per-model resolution list, and a video input."""
+        model_options = []
+        for model_name in VIDEO_MODELS_MODELS_MAP:
+            # Models listed in LIMITED_RESOLUTION_MODELS are not offered the "8K" choice.
+            if model_name in LIMITED_RESOLUTION_MODELS:
+                resolutions = RESOLUTIONS_LIMITED
+            else:
+                resolutions = RESOLUTIONS_FULL
+            model_options.append(
+                IO.DynamicCombo.Option(
+                    model_name,
+                    [IO.Combo.Input("resolution", options=resolutions)],
+                )
+            )
+
+        return IO.Schema(
+            node_id="HitPawVideoEnhance",
+            display_name="HitPaw Video Enhance",
+            category="api node/video/HitPaw",
+            description="Upscale low-resolution videos to high resolution, eliminate artifacts and noise. "
+            "Prices shown are per second of video.",
+            inputs=[
+                IO.DynamicCombo.Input("model", options=model_options),
+                IO.Video.Input("video"),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # The badge picks a price table by model name, then a min/max range by resolution.
+            # NOTE(review): the keys below are lower-case while the combo options are mixed-case;
+            # presumably widget values are lower-cased before the expression runs - confirm.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution"]),
+                expr="""
+                (
+                  $m := $lookup(widgets, "model");
+                  $res := $lookup(widgets, "model.resolution");
+                  $standard_model_prices := {
+                    "original": {"min": 0.01, "max": 0.198},
+                    "720p": {"min": 0.01, "max": 0.06},
+                    "1080p": {"min": 0.015, "max": 0.09},
+                    "2k/qhd": {"min": 0.02, "max": 0.117},
+                    "4k/uhd": {"min": 0.025, "max": 0.152},
+                    "8k": {"min": 0.033, "max": 0.198}
+                  };
+                  $ultra_hd_model_prices := {
+                    "original": {"min": 0.015, "max": 0.264},
+                    "720p": {"min": 0.015, "max": 0.092},
+                    "1080p": {"min": 0.02, "max": 0.12},
+                    "2k/qhd": {"min": 0.026, "max": 0.156},
+                    "4k/uhd": {"min": 0.034, "max": 0.203},
+                    "8k": {"min": 0.044, "max": 0.264}
+                  };
+                  $generative_model_prices := {
+                    "original": {"min": 0.015, "max": 0.338},
+                    "720p": {"min": 0.008, "max": 0.090},
+                    "1080p": {"min": 0.05, "max": 0.15},
+                    "2k/qhd": {"min": 0.038, "max": 0.225},
+                    "4k/uhd": {"min": 0.056, "max": 0.338}
+                  };
+                  $prices := $contains($m, "ultra hd") ? $ultra_hd_model_prices :
+                             $contains($m, "generative") ? $generative_model_prices :
+                             $standard_model_prices;
+                  $price := $lookup($prices, $res);
+                  {
+                    "type": "range_usd",
+                    "min_usd": $price.min,
+                    "max_usd": $price.max,
+                    "format": {"approximate": true, "suffix": "/second"}
+                  }
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: InputVideoModel,
+        video: Input.Video,
+    ) -> IO.NodeOutput:
+        """Create a video enhancement task, poll it until it finishes and return the enhanced video.
+
+        The output size is derived from the selected resolution:
+
+        * ``"original"`` keeps the source dimensions;
+        * square sources use the edge length from ``RESOLUTION_SQUARE_MAP``;
+        * other sources set the shorter side to the ``RESOLUTION_TARGET_MAP`` value and
+          scale the longer side by the source aspect ratio (truncated with ``int``).
+
+        Raises:
+            ValueError: if the selected resolution is smaller than the source video, or
+                if the task-creation response carries a code other than 200.
+        """
+        validate_video_duration(video, min_duration=0.5, max_duration=60 * 60)
+        resolution = model["resolution"]
+        src_width, src_height = video.get_dimensions()
+
+        if resolution == "original":
+            output_width = src_width
+            output_height = src_height
+        else:
+            if src_width == src_height:
+                target_size = RESOLUTION_SQUARE_MAP[resolution]
+                if target_size < src_width:
+                    raise ValueError(
+                        f"Selected resolution {resolution} ({target_size}x{target_size}) is smaller than "
+                        f"the input video ({src_width}x{src_height}). Please select a higher resolution or 'original'."
+                    )
+                output_width = target_size
+                output_height = target_size
+            else:
+                min_dimension = min(src_width, src_height)
+                target_size = RESOLUTION_TARGET_MAP[resolution]
+                if target_size < min_dimension:
+                    raise ValueError(
+                        f"Selected resolution {resolution} ({target_size}p) is smaller than "
+                        f"the input video's shorter dimension ({min_dimension}p). "
+                        f"Please select a higher resolution or 'original'."
+                    )
+                # The target applies to the shorter side; the longer side keeps the aspect ratio.
+                if src_width > src_height:
+                    output_height = target_size
+                    output_width = int(target_size * (src_width / src_height))
+                else:
+                    output_width = target_size
+                    output_height = int(target_size * (src_height / src_width))
+        initial_res = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/hitpaw/api/video-enhancer", method="POST"),
+            response_model=TaskCreateResponse,
+            data=VideoEnhanceTaskCreateRequest(
+                video_url=await upload_video_to_comfyapi(cls, video),
+                resolution=[output_width, output_height],
+                original_resolution=[src_width, src_height],
+                model_name=VIDEO_MODELS_MODELS_MAP[model["model"]],
+            ),
+            wait_label="Creating task",
+            final_label_on_success="Task created",
+        )
+        # Check the status code before touching `data`, so a failed creation reports the
+        # API's own code and message instead of failing on a missing payload.
+        if initial_res.code != 200:
+            raise ValueError(f"Task creation failed with code {initial_res.code}: {initial_res.message}")
+        # Presumably consume_coins is in thousandths of the displayed price unit - TODO confirm.
+        request_price = initial_res.data.consume_coins / 1000
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path="/proxy/hitpaw/api/task-status", method="POST"),
+            data=TaskStatusPollRequest(job_id=initial_res.data.job_id),
+            response_model=TaskStatusResponse,
+            status_extractor=lambda x: x.data.status,
+            # The price is taken from the creation response, not from the poll response.
+            price_extractor=lambda x: request_price,
+            poll_interval=10.0,
+            max_poll_attempts=320,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(final_response.data.res_url))
+
+
+class HitPawExtension(ComfyExtension):
+    """Extension that lists the HitPaw image and video enhancement node classes."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_classes: list[type[IO.ComfyNode]] = [HitPawGeneralImageEnhance, HitPawVideoEnhance]
+        return node_classes
+
+
+async def comfy_entrypoint() -> HitPawExtension:
+    """Create and return a new ``HitPawExtension`` instance."""
+    extension = HitPawExtension()
+    return extension
@@ -0,0 +1,300 @@
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.hunyuan3d import (
+    Hunyuan3DViewImage,
+    InputGenerateType,
+    ResultFile3D,
+    To3DProTaskCreateResponse,
+    To3DProTaskQueryRequest,
+    To3DProTaskRequest,
+    To3DProTaskResultResponse,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_file_3d,
+    downscale_image_tensor_by_max_side,
+    poll_op,
+    sync_op,
+    upload_image_to_comfyapi,
+    validate_image_dimensions,
+    validate_string,
+)
+
+
+def get_file_from_response(response_objs: list[ResultFile3D], file_type: str) -> ResultFile3D | None:
+    """Return the first entry whose ``Type`` equals ``file_type`` ignoring case, or ``None`` if none does."""
+    matching = (entry for entry in response_objs if entry.Type.lower() == file_type.lower())
+    return next(matching, None)
+
+
+class TencentTextToModelNode(IO.ComfyNode):
+    """API node that creates a Hunyuan3D Pro task from a text prompt and returns the resulting 3D files."""
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node: model, prompt, face count, generate type (with sub-options) and seed."""
+        return IO.Schema(
+            node_id="TencentTextToModelNode",
+            display_name="Hunyuan3D: Text to Model (Pro)",
+            category="api node/3d/Tencent",
+            inputs=[
+                IO.Combo.Input(
+                    "model",
+                    options=["3.0", "3.1"],
+                    tooltip="The LowPoly option is unavailable for the `3.1` model.",
+                ),
+                IO.String.Input("prompt", multiline=True, default="", tooltip="Supports up to 1024 characters."),
+                IO.Int.Input("face_count", default=500000, min=40000, max=1500000),
+                # Each generate type exposes its own sub-widgets; "Geometry" has none.
+                IO.DynamicCombo.Input(
+                    "generate_type",
+                    options=[
+                        IO.DynamicCombo.Option("Normal", [IO.Boolean.Input("pbr", default=False)]),
+                        IO.DynamicCombo.Option(
+                            "LowPoly",
+                            [
+                                IO.Combo.Input("polygon_type", options=["triangle", "quadrilateral"]),
+                                IO.Boolean.Input("pbr", default=False),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option("Geometry", []),
+                    ],
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DOBJ.Output(display_name="OBJ"),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            is_output_node=True,
+            # The badge adds a base amount per generate type plus surcharges for PBR and for a
+            # face count other than 500000, then multiplies the sum by 0.02.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["generate_type", "generate_type.pbr", "face_count"]),
+                expr="""
+                (
+                  $base := widgets.generate_type = "normal" ? 25 : widgets.generate_type = "lowpoly" ? 30 : 15;
+                  $pbr := $lookup(widgets, "generate_type.pbr") ? 10 : 0;
+                  $face := widgets.face_count != 500000 ? 10 : 0;
+                  {"type":"usd","usd": ($base + $pbr + $face) * 0.02}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        prompt: str,
+        face_count: int,
+        generate_type: InputGenerateType,
+        seed: int,
+    ) -> IO.NodeOutput:
+        """Create a text-to-3D task, poll it until it finishes and download the GLB/OBJ results.
+
+        Returns:
+            A node output of ``(glb, glb, obj)``: the GLB file fills both the first
+            ("model_file") and the second ("GLB") output; an entry is ``None`` when the
+            result contains no file of that type.
+
+        Raises:
+            ValueError: if LowPoly is requested with model "3.1", or if the task-creation
+                response contains an error.
+        """
+        # The seed is not sent to the API; it is only an input of the node.
+        _ = seed
+        validate_string(prompt, field_name="prompt", min_length=1, max_length=1024)
+        if model == "3.1" and generate_type["generate_type"].lower() == "lowpoly":
+            raise ValueError("The LowPoly option is currently unavailable for the 3.1 model.")
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro", method="POST"),
+            response_model=To3DProTaskCreateResponse,
+            data=To3DProTaskRequest(
+                Model=model,
+                Prompt=prompt,
+                FaceCount=face_count,
+                GenerateType=generate_type["generate_type"],
+                # Sub-options exist only for some generate types, so missing keys become None.
+                EnablePBR=generate_type.get("pbr", None),
+                PolygonType=generate_type.get("polygon_type", None),
+            ),
+        )
+        if response.Error:
+            raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
+        task_id = response.JobId
+        result = await poll_op(
+            cls,
+            ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro/query", method="POST"),
+            data=To3DProTaskQueryRequest(JobId=task_id),
+            response_model=To3DProTaskResultResponse,
+            status_extractor=lambda r: r.Status,
+        )
+        glb_result = get_file_from_response(result.ResultFile3Ds, "glb")
+        obj_result = get_file_from_response(result.ResultFile3Ds, "obj")
+        # Download the GLB once and reuse it for the first two outputs.
+        file_glb = await download_url_to_file_3d(glb_result.Url, "glb", task_id=task_id) if glb_result else None
+        return IO.NodeOutput(
+            file_glb, file_glb, await download_url_to_file_3d(obj_result.Url, "obj", task_id=task_id) if obj_result else None
+        )
+
+
+class TencentImageToModelNode(IO.ComfyNode):
+    """API node that creates a Hunyuan3D Pro task from one or more images and returns the resulting 3D files."""
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node: model, main image, optional side/back views, face count, generate type and seed."""
+        return IO.Schema(
+            node_id="TencentImageToModelNode",
+            display_name="Hunyuan3D: Image(s) to Model (Pro)",
+            category="api node/3d/Tencent",
+            inputs=[
+                IO.Combo.Input(
+                    "model",
+                    options=["3.0", "3.1"],
+                    tooltip="The LowPoly option is unavailable for the `3.1` model.",
+                ),
+                IO.Image.Input("image"),
+                IO.Image.Input("image_left", optional=True),
+                IO.Image.Input("image_right", optional=True),
+                IO.Image.Input("image_back", optional=True),
+                IO.Int.Input("face_count", default=500000, min=40000, max=1500000),
+                # Each generate type exposes its own sub-widgets; "Geometry" has none.
+                IO.DynamicCombo.Input(
+                    "generate_type",
+                    options=[
+                        IO.DynamicCombo.Option("Normal", [IO.Boolean.Input("pbr", default=False)]),
+                        IO.DynamicCombo.Option(
+                            "LowPoly",
+                            [
+                                IO.Combo.Input("polygon_type", options=["triangle", "quadrilateral"]),
+                                IO.Boolean.Input("pbr", default=False),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option("Geometry", []),
+                    ],
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DOBJ.Output(display_name="OBJ"),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            is_output_node=True,
+            # The badge adds a base amount per generate type plus surcharges for any connected
+            # extra view, for PBR and for a face count other than 500000, then multiplies by 0.02.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(
+                    widgets=["generate_type", "generate_type.pbr", "face_count"],
+                    inputs=["image_left", "image_right", "image_back"],
+                ),
+                expr="""
+                (
+                  $base := widgets.generate_type = "normal" ? 25 : widgets.generate_type = "lowpoly" ? 30 : 15;
+                  $multiview := (
+                    inputs.image_left.connected or inputs.image_right.connected or inputs.image_back.connected
+                  ) ? 10 : 0;
+                  $pbr := $lookup(widgets, "generate_type.pbr") ? 10 : 0;
+                  $face := widgets.face_count != 500000 ? 10 : 0;
+                  {"type":"usd","usd": ($base + $multiview + $pbr + $face) * 0.02}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        image: Input.Image,
+        face_count: int,
+        generate_type: InputGenerateType,
+        seed: int,
+        image_left: Input.Image | None = None,
+        image_right: Input.Image | None = None,
+        image_back: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        """Create an image-to-3D task, poll it until it finishes and download the GLB/OBJ results.
+
+        The main image and every provided extra view are validated (at least 128x128),
+        downscaled to a maximum side of 4900 and uploaded as WebP before the task is created.
+
+        Returns:
+            A node output of ``(glb, glb, obj)``: the GLB file fills both the first
+            ("model_file") and the second ("GLB") output; an entry is ``None`` when the
+            result contains no file of that type.
+
+        Raises:
+            ValueError: if LowPoly is requested with model "3.1", or if the task-creation
+                response contains an error.
+        """
+        # The seed is not sent to the API; it is only an input of the node.
+        _ = seed
+        if model == "3.1" and generate_type["generate_type"].lower() == "lowpoly":
+            raise ValueError("The LowPoly option is currently unavailable for the 3.1 model.")
+        validate_image_dimensions(image, min_width=128, min_height=128)
+        multiview_images = []
+        # Upload only the extra views that were actually provided, tagged with their view type.
+        for k, v in {
+            "left": image_left,
+            "right": image_right,
+            "back": image_back,
+        }.items():
+            if v is None:
+                continue
+            validate_image_dimensions(v, min_width=128, min_height=128)
+            multiview_images.append(
+                Hunyuan3DViewImage(
+                    ViewType=k,
+                    ViewImageUrl=await upload_image_to_comfyapi(
+                        cls,
+                        downscale_image_tensor_by_max_side(v, max_side=4900),
+                        mime_type="image/webp",
+                        total_pixels=24_010_000,  # 4900 * 4900
+                    ),
+                )
+            )
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro", method="POST"),
+            response_model=To3DProTaskCreateResponse,
+            data=To3DProTaskRequest(
+                Model=model,
+                FaceCount=face_count,
+                GenerateType=generate_type["generate_type"],
+                ImageUrl=await upload_image_to_comfyapi(
+                    cls,
+                    downscale_image_tensor_by_max_side(image, max_side=4900),
+                    mime_type="image/webp",
+                    total_pixels=24_010_000,  # 4900 * 4900
+                ),
+                # An empty list is sent as None rather than as [].
+                MultiViewImages=multiview_images if multiview_images else None,
+                # Sub-options exist only for some generate types, so missing keys become None.
+                EnablePBR=generate_type.get("pbr", None),
+                PolygonType=generate_type.get("polygon_type", None),
+            ),
+        )
+        if response.Error:
+            raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
+        task_id = response.JobId
+        result = await poll_op(
+            cls,
+            ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro/query", method="POST"),
+            data=To3DProTaskQueryRequest(JobId=task_id),
+            response_model=To3DProTaskResultResponse,
+            status_extractor=lambda r: r.Status,
+        )
+        glb_result = get_file_from_response(result.ResultFile3Ds, "glb")
+        obj_result = get_file_from_response(result.ResultFile3Ds, "obj")
+        # Download the GLB once and reuse it for the first two outputs.
+        file_glb = await download_url_to_file_3d(glb_result.Url, "glb", task_id=task_id) if glb_result else None
+        return IO.NodeOutput(
+            file_glb, file_glb, await download_url_to_file_3d(obj_result.Url, "obj", task_id=task_id) if obj_result else None
+        )
+
+
+class TencentHunyuan3DExtension(ComfyExtension):
+    """Extension that lists the Hunyuan3D text-to-model and image-to-model node classes."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_classes: list[type[IO.ComfyNode]] = [TencentTextToModelNode, TencentImageToModelNode]
+        return node_classes
+
+
+async def comfy_entrypoint() -> TencentHunyuan3DExtension:
+    """Create and return a new ``TencentHunyuan3DExtension`` instance."""
+    extension = TencentHunyuan3DExtension()
+    return extension
@@ -249,7 +249,6 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusRe
        ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"),
        response_model=TaskStatusResponse,
        status_extractor=lambda r: (r.data.task_status if r.data else None),
-        max_poll_attempts=160,
    )
    return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))

@@ -0,0 +1,889 @@
+import math
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.magnific import (
+    ImageRelightAdvancedSettingsRequest,
+    ImageRelightRequest,
+    ImageSkinEnhancerCreativeRequest,
+    ImageSkinEnhancerFaithfulRequest,
+    ImageSkinEnhancerFlexibleRequest,
+    ImageStyleTransferRequest,
+    ImageUpscalerCreativeRequest,
+    ImageUpscalerPrecisionV2Request,
+    InputAdvancedSettings,
+    InputPortraitMode,
+    InputSkinEnhancerMode,
+    TaskResponse,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_image_tensor,
+    downscale_image_tensor,
+    get_image_dimensions,
+    get_number_of_images,
+    poll_op,
+    sync_op,
+    upload_images_to_comfyapi,
+    validate_image_aspect_ratio,
+    validate_image_dimensions,
+)
+
+
+def _pick_creative_upscale_plan(input_pixels, requested_scale, max_output_pixels):
+    """Choose the scale factor and input pixel budget for an over-limit creative upscale.
+
+    Candidates 16, 8, 4 and 2 are tried in that order, skipping any larger than
+    ``requested_scale``. A candidate is accepted as-is when its output fits in
+    ``max_output_pixels``; otherwise it is accepted when the input would have to
+    shrink by at most 2x per side. The server upscales in 2x steps, so a more
+    aggressive downscale would degrade quality.
+
+    Returns:
+        ``(scale, pixel_budget)``. ``pixel_budget`` is ``None`` when the input
+        needs no downscaling. When no candidate is accepted the result is
+        ``(2, max_output_pixels // 4)``.
+    """
+    for candidate in (16, 8, 4, 2):
+        if candidate > requested_scale:
+            continue
+        candidate_output = input_pixels * candidate * candidate
+        if candidate_output <= max_output_pixels:
+            return candidate, None
+        # Square root converts the pixel-count ratio into a per-side ratio.
+        if math.sqrt(candidate_output / max_output_pixels) <= 2.0:
+            return candidate, max_output_pixels // (candidate * candidate)
+    return 2, max_output_pixels // 4
+
+
+class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
+    """API node that submits one image to ``/proxy/freepik/v1/ai/image-upscaler``."""
+
+    @classmethod
+    def define_schema(cls):
+        """Build the node schema: inputs, one image output, hidden auth fields and price badge."""
+        # Settings shared by the four -10..10 integer sliders.
+        signed_slider = dict(min=-10, max=10, default=0, display_mode=IO.NumberDisplay.slider)
+        optimization_targets = [
+            "standard",
+            "soft_portraits",
+            "hard_portraits",
+            "art_n_illustration",
+            "videogame_assets",
+            "nature_n_landscapes",
+            "films_n_photography",
+            "3d_renders",
+            "science_fiction_n_horror",
+        ]
+        return IO.Schema(
+            node_id="MagnificImageUpscalerCreativeNode",
+            display_name="Magnific Image Upscale (Creative)",
+            category="api node/image/Magnific",
+            description="Prompt‑guided enhancement, stylization, and 2x/4x/8x/16x upscaling. "
+            "Maximum output: 25.3 megapixels.",
+            inputs=[
+                IO.Image.Input("image"),
+                IO.String.Input("prompt", multiline=True, default=""),
+                IO.Combo.Input("scale_factor", options=["2x", "4x", "8x", "16x"]),
+                IO.Combo.Input("optimized_for", options=optimization_targets),
+                IO.Int.Input("creativity", **signed_slider),
+                IO.Int.Input("hdr", tooltip="The level of definition and detail.", **signed_slider),
+                IO.Int.Input(
+                    "resemblance",
+                    tooltip="The level of resemblance to the original image.",
+                    **signed_slider,
+                ),
+                IO.Int.Input(
+                    "fractality",
+                    tooltip="The strength of the prompt and intricacy per square pixel.",
+                    **signed_slider,
+                ),
+                IO.Combo.Input(
+                    "engine",
+                    options=["automatic", "magnific_illusio", "magnific_sharpy", "magnific_sparkle"],
+                ),
+                IO.Boolean.Input(
+                    "auto_downscale",
+                    default=False,
+                    tooltip="Automatically downscale input image if output would exceed maximum pixel limit.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]),
+                expr="""
+                (
+                  $max := widgets.scale_factor = "2x" ? 1.326 : 1.657;
+                  {"type": "range_usd", "min_usd": 0.11, "max_usd": $max}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        prompt: str,
+        scale_factor: str,
+        optimized_for: str,
+        creativity: int,
+        hdr: int,
+        resemblance: int,
+        fractality: int,
+        engine: str,
+        auto_downscale: bool,
+    ) -> IO.NodeOutput:
+        """Validate the image, submit the upscale task, poll it and return the first generated image.
+
+        When the requested output would exceed 25,300,000 pixels, the scale
+        factor (and possibly the input size) is reduced if ``auto_downscale``
+        is set; otherwise the request is rejected.
+
+        Raises:
+            ValueError: if ``image`` does not hold exactly one image, or if the
+                output would exceed the pixel limit and ``auto_downscale`` is off.
+                The aspect-ratio and dimension validators are called first and
+                presumably raise on violation (not visible here).
+        """
+        if get_number_of_images(image) != 1:
+            raise ValueError("Exactly one input image is required.")
+        validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False)
+        validate_image_dimensions(image, min_height=160, min_width=160)
+
+        max_output_pixels = 25_300_000
+        height, width = get_image_dimensions(image)
+        requested_scale = int(scale_factor.rstrip("x"))
+        output_pixels = height * width * requested_scale * requested_scale
+
+        if output_pixels > max_output_pixels:
+            if not auto_downscale:
+                raise ValueError(
+                    f"Output size ({width * requested_scale}x{height * requested_scale} = {output_pixels:,} pixels) "
+                    f"exceeds maximum allowed size of {max_output_pixels:,} pixels. "
+                    f"Use a smaller input image or lower scale factor."
+                )
+            chosen_scale, pixel_budget = _pick_creative_upscale_plan(
+                width * height, requested_scale, max_output_pixels
+            )
+            if pixel_budget is not None:
+                image = downscale_image_tensor(image, total_pixels=pixel_budget)
+            scale_factor = f"{chosen_scale}x"
+
+        submit_endpoint = ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler", method="POST")
+        # total_pixels=None is passed so the (possibly already downscaled) image is uploaded as prepared above.
+        source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, total_pixels=None))[0]
+        request = ImageUpscalerCreativeRequest(
+            image=source_url,
+            scale_factor=scale_factor,
+            optimized_for=optimized_for,
+            creativity=creativity,
+            hdr=hdr,
+            resemblance=resemblance,
+            fractality=fractality,
+            engine=engine,
+            prompt=prompt or None,
+        )
+        initial_res = await sync_op(cls, submit_endpoint, response_model=TaskResponse, data=request)
+
+        status_endpoint = ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler/{initial_res.task_id}")
+        final_response = await poll_op(
+            cls,
+            status_endpoint,
+            response_model=TaskResponse,
+            status_extractor=lambda task: task.status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        result_url = final_response.generated[0]
+        return IO.NodeOutput(await download_url_to_image_tensor(result_url))
+
+
+def _pick_precise_upscale_plan(width, height, requested_scale, max_output_dimension):
+    """Choose the scale factor and input pixel budget for an over-limit precise upscale.
+
+    Candidates 16, 8, 4 and 2 are tried in that order, skipping any larger than
+    ``requested_scale``. A candidate is accepted as-is when the longer side,
+    multiplied by it, fits in ``max_output_dimension``; otherwise it is accepted
+    when the input would have to shrink by at most 2x per side. The server
+    upscales in 2x steps, so a more aggressive downscale would degrade quality.
+
+    Returns:
+        ``(scale, pixel_budget)``. ``pixel_budget`` is ``None`` when the input
+        needs no downscaling. When no candidate is accepted the result is 2x
+        with the budget computed for a 2x upscale.
+    """
+    longest_side = max(width, height)
+
+    def pixel_budget_for(factor):
+        # Total input pixels once the longer side is reduced to the largest value that still fits.
+        side_ratio = (max_output_dimension // factor) / longest_side
+        return int(width * height * side_ratio * side_ratio)
+
+    for candidate in (16, 8, 4, 2):
+        if candidate > requested_scale:
+            continue
+        candidate_side = longest_side * candidate
+        if candidate_side <= max_output_dimension:
+            return candidate, None
+        if candidate_side / max_output_dimension <= 2.0:
+            return candidate, pixel_budget_for(candidate)
+    return 2, pixel_budget_for(2)
+
+
+class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
+    """API node that submits one image to ``/proxy/freepik/v1/ai/image-upscaler-precision-v2``."""
+
+    @classmethod
+    def define_schema(cls):
+        """Build the node schema: inputs, one image output, hidden auth fields and price badge."""
+        # Settings shared by the three 0..100 integer sliders; each supplies its own default.
+        percent_slider = dict(min=0, max=100, display_mode=IO.NumberDisplay.slider)
+        return IO.Schema(
+            node_id="MagnificImageUpscalerPreciseV2Node",
+            display_name="Magnific Image Upscale (Precise V2)",
+            category="api node/image/Magnific",
+            description="High-fidelity upscaling with fine control over sharpness, grain, and detail. "
+            "Maximum output: 10060×10060 pixels.",
+            inputs=[
+                IO.Image.Input("image"),
+                IO.Combo.Input("scale_factor", options=["2x", "4x", "8x", "16x"]),
+                IO.Combo.Input(
+                    "flavor",
+                    options=["sublime", "photo", "photo_denoiser"],
+                    tooltip="Processing style: "
+                    "sublime for general use, photo for photographs, photo_denoiser for noisy photos.",
+                ),
+                IO.Int.Input(
+                    "sharpen",
+                    default=7,
+                    tooltip="Image sharpness intensity. Higher values increase edge definition and clarity.",
+                    **percent_slider,
+                ),
+                IO.Int.Input(
+                    "smart_grain",
+                    default=7,
+                    tooltip="Intelligent grain/texture enhancement to prevent the image from "
+                    "looking too smooth or artificial.",
+                    **percent_slider,
+                ),
+                IO.Int.Input(
+                    "ultra_detail",
+                    default=30,
+                    tooltip="Controls fine detail, textures, and micro-details added during upscaling.",
+                    **percent_slider,
+                ),
+                IO.Boolean.Input(
+                    "auto_downscale",
+                    default=False,
+                    tooltip="Automatically downscale input image if output would exceed maximum resolution.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]),
+                expr="""
+                (
+                  $max := widgets.scale_factor = "2x" ? 1.326 : 1.657;
+                  {"type": "range_usd", "min_usd": 0.11, "max_usd": $max}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        scale_factor: str,
+        flavor: str,
+        sharpen: int,
+        smart_grain: int,
+        ultra_detail: int,
+        auto_downscale: bool,
+    ) -> IO.NodeOutput:
+        """Validate the image, submit the upscale task, poll it and return the first generated image.
+
+        When either output side would exceed 10060 pixels, the scale factor
+        (and possibly the input size) is reduced if ``auto_downscale`` is set;
+        otherwise the request is rejected. The scale factor is sent to the API
+        as an integer.
+
+        Raises:
+            ValueError: if ``image`` does not hold exactly one image, or if the
+                output would exceed the size limit and ``auto_downscale`` is off.
+                The aspect-ratio and dimension validators are called first and
+                presumably raise on violation (not visible here).
+        """
+        if get_number_of_images(image) != 1:
+            raise ValueError("Exactly one input image is required.")
+        validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False)
+        validate_image_dimensions(image, min_height=160, min_width=160)
+
+        max_output_dimension = 10060
+        height, width = get_image_dimensions(image)
+        requested_scale = int(scale_factor.strip("x"))
+        output_width = width * requested_scale
+        output_height = height * requested_scale
+
+        effective_scale = requested_scale
+        if output_width > max_output_dimension or output_height > max_output_dimension:
+            if not auto_downscale:
+                raise ValueError(
+                    f"Output dimensions ({output_width}x{output_height}) exceed maximum allowed "
+                    f"resolution of {max_output_dimension}x{max_output_dimension} pixels. "
+                    f"Use a smaller input image or lower scale factor."
+                )
+            effective_scale, pixel_budget = _pick_precise_upscale_plan(
+                width, height, requested_scale, max_output_dimension
+            )
+            if pixel_budget is not None:
+                image = downscale_image_tensor(image, total_pixels=pixel_budget)
+
+        submit_endpoint = ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler-precision-v2", method="POST")
+        # total_pixels=None is passed so the (possibly already downscaled) image is uploaded as prepared above.
+        source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, total_pixels=None))[0]
+        request = ImageUpscalerPrecisionV2Request(
+            image=source_url,
+            scale_factor=effective_scale,
+            flavor=flavor,
+            sharpen=sharpen,
+            smart_grain=smart_grain,
+            ultra_detail=ultra_detail,
+        )
+        initial_res = await sync_op(cls, submit_endpoint, response_model=TaskResponse, data=request)
+
+        status_endpoint = ApiEndpoint(
+            path=f"/proxy/freepik/v1/ai/image-upscaler-precision-v2/{initial_res.task_id}"
+        )
+        final_response = await poll_op(
+            cls,
+            status_endpoint,
+            response_model=TaskResponse,
+            status_extractor=lambda task: task.status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        result_url = final_response.generated[0]
+        return IO.NodeOutput(await download_url_to_image_tensor(result_url))
+
+
+class MagnificImageStyleTransferNode(IO.ComfyNode):
+    """API node that submits an image and a style reference to ``/proxy/freepik/v1/ai/image-style-transfer``."""
+
+    @classmethod
+    def define_schema(cls):
+        """Build the node schema: inputs, one image output, hidden auth fields and price badge."""
+        # Settings shared by the two 0..100 integer sliders; each supplies its own default.
+        percent_slider = dict(min=0, max=100, display_mode=IO.NumberDisplay.slider)
+        engine_choices = [
+            "balanced",
+            "definio",
+            "illusio",
+            "3d_cartoon",
+            "colorful_anime",
+            "caricature",
+            "real",
+            "super_real",
+            "softy",
+        ]
+        # Extra widgets that are only part of the "enabled" portrait_mode option.
+        portrait_widgets = [
+            IO.Combo.Input(
+                "portrait_style",
+                options=["standard", "pop", "super_pop"],
+                tooltip="Visual style applied to portrait images.",
+            ),
+            IO.Combo.Input(
+                "portrait_beautifier",
+                options=["none", "beautify_face", "beautify_face_max"],
+                tooltip="Facial beautification intensity on portraits.",
+            ),
+        ]
+        return IO.Schema(
+            node_id="MagnificImageStyleTransferNode",
+            display_name="Magnific Image Style Transfer",
+            category="api node/image/Magnific",
+            description="Transfer the style from a reference image to your input image.",
+            inputs=[
+                IO.Image.Input("image", tooltip="The image to apply style transfer to."),
+                IO.Image.Input("reference_image", tooltip="The reference image to extract style from."),
+                IO.String.Input("prompt", multiline=True, default=""),
+                IO.Int.Input(
+                    "style_strength",
+                    default=100,
+                    tooltip="Percentage of style strength.",
+                    **percent_slider,
+                ),
+                IO.Int.Input(
+                    "structure_strength",
+                    default=50,
+                    tooltip="Maintains the structure of the original image.",
+                    **percent_slider,
+                ),
+                IO.Combo.Input(
+                    "flavor",
+                    options=["faithful", "gen_z", "psychedelia", "detaily", "clear", "donotstyle", "donotstyle_sharp"],
+                    tooltip="Style transfer flavor.",
+                ),
+                IO.Combo.Input("engine", options=engine_choices, tooltip="Processing engine selection."),
+                IO.DynamicCombo.Input(
+                    "portrait_mode",
+                    options=[
+                        IO.DynamicCombo.Option("disabled", []),
+                        IO.DynamicCombo.Option("enabled", portrait_widgets),
+                    ],
+                    tooltip="Enable portrait mode for facial enhancements.",
+                ),
+                IO.Boolean.Input(
+                    "fixed_generation",
+                    default=True,
+                    tooltip="When disabled, expect each generation to introduce a degree of randomness, "
+                    "leading to more diverse outcomes.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.11}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        reference_image: Input.Image,
+        prompt: str,
+        style_strength: int,
+        structure_strength: int,
+        flavor: str,
+        engine: str,
+        portrait_mode: InputPortraitMode,
+        fixed_generation: bool,
+    ) -> IO.NodeOutput:
+        """Validate both images, submit the style-transfer task, poll it and return the first generated image.
+
+        Portrait style and beautifier are only sent when ``portrait_mode`` is
+        "enabled"; a beautifier value of "none" is sent as ``None``.
+
+        Raises:
+            ValueError: if ``image`` or ``reference_image`` does not hold exactly
+                one image. The aspect-ratio and dimension validators are called
+                afterwards and presumably raise on violation (not visible here).
+        """
+        both_images = (image, reference_image)
+        count_errors = (
+            "Exactly one input image is required.",
+            "Exactly one reference image is required.",
+        )
+        # Each kind of check runs for the input image first, then the reference image.
+        for tensor, message in zip(both_images, count_errors):
+            if get_number_of_images(tensor) != 1:
+                raise ValueError(message)
+        for tensor in both_images:
+            validate_image_aspect_ratio(tensor, (1, 3), (3, 1), strict=False)
+        for tensor in both_images:
+            validate_image_dimensions(tensor, min_height=160, min_width=160)
+
+        is_portrait = portrait_mode["portrait_mode"] == "enabled"
+        chosen_style = None
+        chosen_beautifier = None
+        if is_portrait:
+            chosen_style = portrait_mode.get("portrait_style", "standard")
+            beautifier = portrait_mode.get("portrait_beautifier", "none")
+            if beautifier != "none":
+                chosen_beautifier = beautifier
+
+        uploaded_urls = await upload_images_to_comfyapi(cls, [image, reference_image], max_images=2)
+
+        submit_endpoint = ApiEndpoint(path="/proxy/freepik/v1/ai/image-style-transfer", method="POST")
+        request = ImageStyleTransferRequest(
+            image=uploaded_urls[0],
+            reference_image=uploaded_urls[1],
+            prompt=prompt or None,
+            style_strength=style_strength,
+            structure_strength=structure_strength,
+            is_portrait=is_portrait,
+            portrait_style=chosen_style,
+            portrait_beautifier=chosen_beautifier,
+            flavor=flavor,
+            engine=engine,
+            fixed_generation=fixed_generation,
+        )
+        initial_res = await sync_op(cls, submit_endpoint, response_model=TaskResponse, data=request)
+
+        status_endpoint = ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-style-transfer/{initial_res.task_id}")
+        final_response = await poll_op(
+            cls,
+            status_endpoint,
+            response_model=TaskResponse,
+            status_extractor=lambda task: task.status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        result_url = final_response.generated[0]
+        return IO.NodeOutput(await download_url_to_image_tensor(result_url))
+
+
+class MagnificImageRelightNode(IO.ComfyNode):
+    """API node that submits an image (and optional light reference) to ``/proxy/freepik/v1/ai/image-relight``."""
+
+    @classmethod
+    def define_schema(cls):
+        """Build the node schema: inputs, one image output, hidden auth fields and price badge."""
+        # Settings shared by the five advanced 0..100 sliders, all defaulting to 50.
+        midpoint_slider = dict(min=0, max=100, default=50, display_mode=IO.NumberDisplay.slider)
+        style_choices = [
+            "standard",
+            "darker_but_realistic",
+            "clean",
+            "smooth",
+            "brighter",
+            "contrasted_n_hdr",
+            "just_composition",
+        ]
+        engine_choices = [
+            "automatic",
+            "balanced",
+            "cool",
+            "real",
+            "illusio",
+            "fairy",
+            "colorful_anime",
+            "hard_transform",
+            "softy",
+        ]
+        transfer_light_b_choices = [
+            "automatic",
+            "composition",
+            "straight",
+            "smooth_in",
+            "smooth_out",
+            "smooth_both",
+            "reverse_both",
+            "soft_in",
+            "soft_out",
+            "soft_mid",
+            # "strong_mid",  # Commented out because requests fail when this is set.
+            "style_shift",
+            "strong_shift",
+        ]
+        # Extra widgets that are only part of the "enabled" advanced_settings option.
+        advanced_widgets = [
+            IO.Int.Input("whites", tooltip="Adjusts the brightest tones in the image.", **midpoint_slider),
+            IO.Int.Input("blacks", tooltip="Adjusts the darkest tones in the image.", **midpoint_slider),
+            IO.Int.Input("brightness", tooltip="Overall brightness adjustment.", **midpoint_slider),
+            IO.Int.Input("contrast", tooltip="Contrast adjustment.", **midpoint_slider),
+            IO.Int.Input("saturation", tooltip="Color saturation adjustment.", **midpoint_slider),
+            IO.Combo.Input("engine", options=engine_choices, tooltip="Processing engine selection."),
+            IO.Combo.Input(
+                "transfer_light_a",
+                options=["automatic", "low", "medium", "normal", "high", "high_on_faces"],
+                tooltip="The intensity of light transfer.",
+            ),
+            IO.Combo.Input(
+                "transfer_light_b",
+                options=transfer_light_b_choices,
+                tooltip="Also modifies light transfer intensity. "
+                "Can be combined with the previous control for varied effects.",
+            ),
+            IO.Boolean.Input(
+                "fixed_generation",
+                default=True,
+                tooltip="Ensures consistent output with the same settings.",
+            ),
+        ]
+        return IO.Schema(
+            node_id="MagnificImageRelightNode",
+            display_name="Magnific Image Relight",
+            category="api node/image/Magnific",
+            description="Relight an image with lighting adjustments and optional reference-based light transfer.",
+            inputs=[
+                IO.Image.Input("image", tooltip="The image to relight."),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Descriptive guidance for lighting. Supports emphasis notation (1-1.4).",
+                ),
+                IO.Int.Input(
+                    "light_transfer_strength",
+                    min=0,
+                    max=100,
+                    default=100,
+                    tooltip="Intensity of light transfer application.",
+                    display_mode=IO.NumberDisplay.slider,
+                ),
+                IO.Combo.Input("style", options=style_choices, tooltip="Stylistic output preference."),
+                IO.Boolean.Input(
+                    "interpolate_from_original",
+                    default=False,
+                    tooltip="Restricts generation freedom to match original more closely.",
+                ),
+                IO.Boolean.Input(
+                    "change_background",
+                    default=True,
+                    tooltip="Modifies background based on prompt/reference.",
+                ),
+                IO.Boolean.Input(
+                    "preserve_details",
+                    default=True,
+                    tooltip="Maintains texture and fine details from original.",
+                ),
+                IO.DynamicCombo.Input(
+                    "advanced_settings",
+                    options=[
+                        IO.DynamicCombo.Option("disabled", []),
+                        IO.DynamicCombo.Option("enabled", advanced_widgets),
+                    ],
+                    tooltip="Fine-tuning options for advanced lighting control.",
+                ),
+                IO.Image.Input(
+                    "reference_image",
+                    optional=True,
+                    tooltip="Optional reference image to transfer lighting from.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.11}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        prompt: str,
+        light_transfer_strength: int,
+        style: str,
+        interpolate_from_original: bool,
+        change_background: bool,
+        preserve_details: bool,
+        advanced_settings: InputAdvancedSettings,
+        reference_image: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        """Validate the images, submit the relight task, poll it and return the first generated image.
+
+        The reference image is optional; when absent no reference URL is sent.
+        The advanced settings object is only built when ``advanced_settings``
+        is "enabled".
+
+        Raises:
+            ValueError: if ``image`` (or a supplied ``reference_image``) does not
+                hold exactly one image. The aspect-ratio and dimension validators
+                are called afterwards and presumably raise on violation (not
+                visible here).
+        """
+        has_reference = reference_image is not None
+
+        if get_number_of_images(image) != 1:
+            raise ValueError("Exactly one input image is required.")
+        if has_reference and get_number_of_images(reference_image) != 1:
+            raise ValueError("Exactly one reference image is required.")
+        # The input image is fully validated before the reference image is looked at.
+        candidates = (image, reference_image) if has_reference else (image,)
+        for tensor in candidates:
+            validate_image_aspect_ratio(tensor, (1, 3), (3, 1), strict=False)
+            validate_image_dimensions(tensor, min_height=160, min_width=160)
+
+        image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0]
+        reference_url = None
+        if has_reference:
+            reference_url = (await upload_images_to_comfyapi(cls, reference_image, max_images=1))[0]
+
+        adv_settings = None
+        if advanced_settings["advanced_settings"] == "enabled":
+            advanced_keys = (
+                "whites",
+                "blacks",
+                "brightness",
+                "contrast",
+                "saturation",
+                "engine",
+                "transfer_light_a",
+                "transfer_light_b",
+                "fixed_generation",
+            )
+            adv_settings = ImageRelightAdvancedSettingsRequest(
+                **{key: advanced_settings[key] for key in advanced_keys}
+            )
+
+        submit_endpoint = ApiEndpoint(path="/proxy/freepik/v1/ai/image-relight", method="POST")
+        request = ImageRelightRequest(
+            image=image_url,
+            prompt=prompt or None,
+            transfer_light_from_reference_image=reference_url,
+            light_transfer_strength=light_transfer_strength,
+            interpolate_from_original=interpolate_from_original,
+            change_background=change_background,
+            style=style,
+            preserve_details=preserve_details,
+            advanced_settings=adv_settings,
+        )
+        initial_res = await sync_op(cls, submit_endpoint, response_model=TaskResponse, data=request)
+
+        status_endpoint = ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-relight/{initial_res.task_id}")
+        final_response = await poll_op(
+            cls,
+            status_endpoint,
+            response_model=TaskResponse,
+            status_extractor=lambda task: task.status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        result_url = final_response.generated[0]
+        return IO.NodeOutput(await download_url_to_image_tensor(result_url))
+
+
+class MagnificImageSkinEnhancerNode(IO.ComfyNode):
+    """API node that sends one portrait to the Freepik skin-enhancer proxy and returns the result image."""
+
+    @classmethod
+    def define_schema(cls):
+        """Build the node schema: image, two sliders, a mode selector, and a per-mode price badge."""
+        # The three Int inputs below share the same 0-100 slider configuration.
+        slider = {"min": 0, "max": 100, "display_mode": IO.NumberDisplay.slider}
+        skin_detail_input = IO.Int.Input(
+            "skin_detail",
+            default=80,
+            tooltip="Skin detail enhancement level.",
+            **slider,
+        )
+        optimized_for_input = IO.Combo.Input(
+            "optimized_for",
+            options=[
+                "enhance_skin",
+                "improve_lighting",
+                "enhance_everything",
+                "transform_to_real",
+                "no_make_up",
+            ],
+            tooltip="Enhancement optimization target.",
+        )
+        # Each mode option lists the extra widgets that only apply to that mode.
+        mode_input = IO.DynamicCombo.Input(
+            "mode",
+            options=[
+                IO.DynamicCombo.Option("creative", []),
+                IO.DynamicCombo.Option("faithful", [skin_detail_input]),
+                IO.DynamicCombo.Option("flexible", [optimized_for_input]),
+            ],
+            tooltip="Processing mode: creative for artistic enhancement, "
+            "faithful for preserving original appearance, "
+            "flexible for targeted optimization.",
+        )
+        return IO.Schema(
+            node_id="MagnificImageSkinEnhancerNode",
+            display_name="Magnific Image Skin Enhancer",
+            category="api node/image/Magnific",
+            description="Skin enhancement for portraits with multiple processing modes.",
+            inputs=[
+                IO.Image.Input("image", tooltip="The portrait image to enhance."),
+                IO.Int.Input("sharpen", default=0, tooltip="Sharpening intensity level.", **slider),
+                IO.Int.Input("smart_grain", default=2, tooltip="Smart grain intensity level.", **slider),
+                mode_input,
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["mode"]),
+                expr="""
+                (
+                  $rates := {"creative": 0.29, "faithful": 0.37, "flexible": 0.45};
+                  {"type":"usd","usd": $lookup($rates, widgets.mode)}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        sharpen: int,
+        smart_grain: int,
+        mode: InputSkinEnhancerMode,
+    ) -> IO.NodeOutput:
+        """Upload the image, start a skin-enhancer task for the chosen mode, poll it, and return the output.
+
+        Raises:
+            ValueError: if ``image`` does not contain exactly one image.
+        """
+        image_count = get_number_of_images(image)
+        if image_count != 1:
+            raise ValueError("Exactly one input image is required.")
+        validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False)
+        validate_image_dimensions(image, min_height=160, min_width=160)
+
+        uploaded = await upload_images_to_comfyapi(cls, image, max_images=1, total_pixels=4096 * 4096)
+        # Fields sent for every mode; mode-specific fields are added per branch.
+        common = {"image": uploaded[0], "sharpen": sharpen, "smart_grain": smart_grain}
+        selected_mode = mode["mode"]
+
+        if selected_mode == "creative":
+            endpoint = "creative"
+            data = ImageSkinEnhancerCreativeRequest(**common)
+        elif selected_mode == "faithful":
+            endpoint = "faithful"
+            data = ImageSkinEnhancerFaithfulRequest(**common, skin_detail=mode["skin_detail"])
+        else:  # any other value is handled as "flexible"
+            endpoint = "flexible"
+            data = ImageSkinEnhancerFlexibleRequest(**common, optimized_for=mode["optimized_for"])
+
+        submit_endpoint = ApiEndpoint(path=f"/proxy/freepik/v1/ai/skin-enhancer/{endpoint}", method="POST")
+        initial_res = await sync_op(cls, submit_endpoint, response_model=TaskResponse, data=data)
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/freepik/v1/ai/skin-enhancer/{initial_res.task_id}"),
+            response_model=TaskResponse,
+            status_extractor=lambda x: x.status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))
+
+
+class MagnificExtension(ComfyExtension):
+    """Extension that lists the Magnific API node classes."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the Magnific node classes provided by this extension."""
+        # The two upscaler nodes are commented out, so they are not part of the returned list.
+        enabled_nodes = [
+            # MagnificImageUpscalerCreativeNode,
+            # MagnificImageUpscalerPreciseV2Node,
+            MagnificImageStyleTransferNode,
+            MagnificImageRelightNode,
+            MagnificImageSkinEnhancerNode,
+        ]
+        return enabled_nodes
+
+
+async def comfy_entrypoint() -> MagnificExtension:
+    """Create and return a new ``MagnificExtension`` instance."""
+    extension = MagnificExtension()
+    return extension
@@ -1,5 +1,3 @@
-import os
-
 from typing_extensions import override

 from comfy_api.latest import IO, ComfyExtension, Input
@@ -20,13 +18,12 @@ from comfy_api_nodes.apis.meshy import (
 )
 from comfy_api_nodes.util import (
    ApiEndpoint,
-    download_url_to_bytesio,
+    download_url_to_file_3d,
    poll_op,
    sync_op,
    upload_images_to_comfyapi,
    validate_string,
 )
-from folder_paths import get_output_directory


 class MeshyTextToModelNode(IO.ComfyNode):
@@ -79,8 +76,10 @@ class MeshyTextToModelNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MESHY_TASK_ID").Output(display_name="meshy_task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -122,16 +121,20 @@ class MeshyTextToModelNode(IO.ComfyNode):
                seed=seed,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v2/text-to-3d/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v2/text-to-3d/{task_id}"),
            response_model=MeshyModelResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(result.model_urls.glb, os.path.join(get_output_directory(), model_file))
-        return IO.NodeOutput(model_file, response.result)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            task_id,
+            await download_url_to_file_3d(result.model_urls.glb, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.model_urls.fbx, "fbx", task_id=task_id),
+        )


 class MeshyRefineNode(IO.ComfyNode):
@@ -167,8 +170,10 @@ class MeshyRefineNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MESHY_TASK_ID").Output(display_name="meshy_task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -210,16 +215,20 @@ class MeshyRefineNode(IO.ComfyNode):
                ai_model=model,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v2/text-to-3d/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v2/text-to-3d/{task_id}"),
            response_model=MeshyModelResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(result.model_urls.glb, os.path.join(get_output_directory(), model_file))
-        return IO.NodeOutput(model_file, response.result)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            task_id,
+            await download_url_to_file_3d(result.model_urls.glb, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.model_urls.fbx, "fbx", task_id=task_id),
+        )


 class MeshyImageToModelNode(IO.ComfyNode):
@@ -303,8 +312,10 @@ class MeshyImageToModelNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MESHY_TASK_ID").Output(display_name="meshy_task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -368,16 +379,20 @@ class MeshyImageToModelNode(IO.ComfyNode):
                seed=seed,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/image-to-3d/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/image-to-3d/{task_id}"),
            response_model=MeshyModelResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(result.model_urls.glb, os.path.join(get_output_directory(), model_file))
-        return IO.NodeOutput(model_file, response.result)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            task_id,
+            await download_url_to_file_3d(result.model_urls.glb, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.model_urls.fbx, "fbx", task_id=task_id),
+        )


 class MeshyMultiImageToModelNode(IO.ComfyNode):
@@ -464,8 +479,10 @@ class MeshyMultiImageToModelNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MESHY_TASK_ID").Output(display_name="meshy_task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -531,16 +548,20 @@ class MeshyMultiImageToModelNode(IO.ComfyNode):
                seed=seed,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/multi-image-to-3d/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/multi-image-to-3d/{task_id}"),
            response_model=MeshyModelResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(result.model_urls.glb, os.path.join(get_output_directory(), model_file))
-        return IO.NodeOutput(model_file, response.result)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            task_id,
+            await download_url_to_file_3d(result.model_urls.glb, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.model_urls.fbx, "fbx", task_id=task_id),
+        )


 class MeshyRigModelNode(IO.ComfyNode):
@@ -571,8 +592,10 @@ class MeshyRigModelNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MESHY_RIGGED_TASK_ID").Output(display_name="rig_task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -606,18 +629,20 @@ class MeshyRigModelNode(IO.ComfyNode):
                texture_image_url=texture_image_url,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/rigging/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/rigging/{task_id}"),
            response_model=MeshyRiggedResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(
-            result.result.rigged_character_glb_url, os.path.join(get_output_directory(), model_file)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            task_id,
+            await download_url_to_file_3d(result.result.rigged_character_glb_url, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.result.rigged_character_fbx_url, "fbx", task_id=task_id),
        )
-        return IO.NodeOutput(model_file, response.result)


 class MeshyAnimateModelNode(IO.ComfyNode):
@@ -640,7 +665,9 @@ class MeshyAnimateModelNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -669,16 +696,19 @@ class MeshyAnimateModelNode(IO.ComfyNode):
                action_id=action_id,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/animations/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/animations/{task_id}"),
            response_model=MeshyAnimationResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(result.result.animation_glb_url, os.path.join(get_output_directory(), model_file))
-        return IO.NodeOutput(model_file, response.result)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            await download_url_to_file_3d(result.result.animation_glb_url, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.result.animation_fbx_url, "fbx", task_id=task_id),
+        )


 class MeshyTextureNode(IO.ComfyNode):
@@ -715,8 +745,10 @@ class MeshyTextureNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MODEL_TASK_ID").Output(display_name="meshy_task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
+                IO.File3DFBX.Output(display_name="FBX"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -760,16 +792,20 @@ class MeshyTextureNode(IO.ComfyNode):
                image_style_url=image_style_url,
            ),
        )
+        task_id = response.result
        result = await poll_op(
            cls,
-            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/retexture/{response.result}"),
+            ApiEndpoint(path=f"/proxy/meshy/openapi/v1/retexture/{task_id}"),
            response_model=MeshyModelResult,
            status_extractor=lambda r: r.status,
            progress_extractor=lambda r: r.progress,
        )
-        model_file = f"meshy_model_{response.result}.glb"
-        await download_url_to_bytesio(result.model_urls.glb, os.path.join(get_output_directory(), model_file))
-        return IO.NodeOutput(model_file, response.result)
+        return IO.NodeOutput(
+            f"{task_id}.glb",
+            task_id,
+            await download_url_to_file_3d(result.model_urls.glb, "glb", task_id=task_id),
+            await download_url_to_file_3d(result.model_urls.fbx, "fbx", task_id=task_id),
+        )


 class MeshyExtension(ComfyExtension):
@@ -364,9 +364,9 @@ class OpenAIGPTImage1(IO.ComfyNode):
    def define_schema(cls):
        return IO.Schema(
            node_id="OpenAIGPTImage1",
-            display_name="OpenAI GPT Image 1",
+            display_name="OpenAI GPT Image 1.5",
            category="api node/image/OpenAI",
-            description="Generates images synchronously via OpenAI's GPT Image 1 endpoint.",
+            description="Generates images synchronously via OpenAI's GPT Image endpoint.",
            inputs=[
                IO.String.Input(
                    "prompt",
@@ -429,6 +429,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
                IO.Combo.Input(
                    "model",
                    options=["gpt-image-1", "gpt-image-1.5"],
+                    default="gpt-image-1.5",
                    optional=True,
                ),
            ],
@@ -12,6 +12,8 @@ from comfy_api_nodes.apis.recraft import (
    RecraftColor,
    RecraftColorChain,
    RecraftControls,
+    RecraftCreateStyleRequest,
+    RecraftCreateStyleResponse,
    RecraftImageGenerationRequest,
    RecraftImageGenerationResponse,
    RecraftImageSize,
@@ -323,6 +325,75 @@ class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode):
        return IO.NodeOutput(RecraftStyle(style_id=style_id))


+class RecraftCreateStyleNode(IO.ComfyNode):
+    """API node that creates a custom Recraft style from one to five reference images."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the base-style combo, the growable image inputs, and the ``style_id`` output."""
+        return IO.Schema(
+            node_id="RecraftCreateStyleNode",
+            display_name="Recraft Create Style",
+            category="api node/image/Recraft",
+            description="Create a custom style from reference images. "
+            "Upload 1-5 images to use as style references. "
+            "Total size of all images is limited to 5 MB.",
+            inputs=[
+                IO.Combo.Input(
+                    "style",
+                    options=["realistic_image", "digital_illustration"],
+                    tooltip="The base style of the generated images.",
+                ),
+                IO.Autogrow.Input(
+                    "images",
+                    template=IO.Autogrow.TemplatePrefix(
+                        IO.Image.Input("image"),
+                        prefix="image",
+                        min=1,
+                        max=5,
+                    ),
+                ),
+            ],
+            outputs=[
+                IO.String.Output(display_name="style_id"),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd": 0.04}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        style: str,
+        images: IO.Autogrow.Type,
+    ) -> IO.NodeOutput:
+        """Encode the reference images as WebP, post them with the base style, and return the new style id.
+
+        Args:
+            style: Base style name forwarded in the request body.
+            images: Mapping whose values are the reference image tensors.
+
+        Raises:
+            ValueError: if the encoded images together exceed the 5 MB limit.
+        """
+        max_total_size = 5 * 1024 * 1024  # 5 MB limit
+        files = []
+        total_size = 0
+        # Iterate the values directly (no throwaway list); form fields are numbered from 1.
+        for index, img in enumerate(images.values(), start=1):
+            file_bytes = tensor_to_bytesio(img, total_pixels=2048 * 2048, mime_type="image/webp").read()
+            total_size += len(file_bytes)
+            # Stop before any request is sent once the running total passes the limit.
+            if total_size > max_total_size:
+                raise ValueError("Total size of all images exceeds 5 MB limit.")
+            files.append((f"file{index}", file_bytes))
+
+        response = await sync_op(
+            cls,
+            endpoint=ApiEndpoint(path="/proxy/recraft/styles", method="POST"),
+            response_model=RecraftCreateStyleResponse,
+            files=files,
+            data=RecraftCreateStyleRequest(style=style),
+            content_type="multipart/form-data",
+            max_retries=1,
+        )
+
+        return IO.NodeOutput(response.id)
+
+
 class RecraftTextToImageNode(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -395,7 +466,7 @@ class RecraftTextToImageNode(IO.ComfyNode):
        negative_prompt: str = None,
        recraft_controls: RecraftControls = None,
    ) -> IO.NodeOutput:
-        validate_string(prompt, strip_whitespace=False, max_length=1000)
+        validate_string(prompt, strip_whitespace=False, min_length=1, max_length=1000)
        default_style = RecraftStyle(RecraftStyleV3.realistic_image)
        if recraft_style is None:
            recraft_style = default_style
@@ -1024,6 +1095,7 @@ class RecraftExtension(ComfyExtension):
            RecraftStyleV3DigitalIllustrationNode,
            RecraftStyleV3LogoRasterNode,
            RecraftStyleInfiniteStyleLibrary,
+            RecraftCreateStyleNode,
            RecraftColorRGBNode,
            RecraftControlsNode,
        ]
@@ -10,7 +10,6 @@ import folder_paths as comfy_paths
 import os
 import logging
 import math
-from typing import Optional
 from io import BytesIO
 from typing_extensions import override
 from PIL import Image
@@ -28,8 +27,9 @@ from comfy_api_nodes.util import (
    poll_op,
    ApiEndpoint,
    download_url_to_bytesio,
+    download_url_to_file_3d,
 )
-from comfy_api.latest import ComfyExtension, IO
+from comfy_api.latest import ComfyExtension, IO, Types


 COMMON_PARAMETERS = [
@@ -177,7 +177,7 @@ def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str:
        return "DONE"
    return "Generating"

-def extract_progress(response: Rodin3DCheckStatusResponse) -> Optional[int]:
+def extract_progress(response: Rodin3DCheckStatusResponse) -> int | None:
    if not response.jobs:
        return None
    completed_count = sum(1 for job in response.jobs if job.status == JobStatus.Done)
@@ -207,17 +207,25 @@ async def get_rodin_download_list(uuid: str, cls: type[IO.ComfyNode]) -> Rodin3D
    )


-async def download_files(url_list, task_uuid: str):
+async def download_files(url_list, task_uuid: str) -> tuple[str | None, Types.File3D | None]:
+    """Download every entry of ``url_list.list`` into ``<output dir>/Rodin3D_<task_uuid>``.
+
+    Returns a ``(model_file_path, file_3d)`` pair for the ``.glb`` entry: the path is
+    ``Rodin3D_<task_uuid>/<name>`` (without the output-directory prefix) and ``file_3d`` is the
+    value returned by ``download_url_to_file_3d``. Both stay ``None`` when no entry name ends
+    in ``.glb``; if several do, the last one wins.
+    """
     result_folder_name = f"Rodin3D_{task_uuid}"
     save_path = os.path.join(comfy_paths.get_output_directory(), result_folder_name)
     os.makedirs(save_path, exist_ok=True)
     model_file_path = None
+    file_3d = None
+
     for i in url_list.list:
         file_path = os.path.join(save_path, i.name)
-        if file_path.endswith(".glb"):
+        if i.name.lower().endswith(".glb"):
             model_file_path = os.path.join(result_folder_name, i.name)
-        await download_url_to_bytesio(i.url, file_path)
-    return model_file_path
+            file_3d = await download_url_to_file_3d(i.url, "glb")
+            # Save to disk for backward compatibility
+            with open(file_path, "wb") as f:
+                f.write(file_3d.get_bytes())
+        else:
+            await download_url_to_bytesio(i.url, file_path)
+
+    return model_file_path, file_3d


 class Rodin3D_Regular(IO.ComfyNode):
@@ -234,7 +242,10 @@ class Rodin3D_Regular(IO.ComfyNode):
                IO.Image.Input("Images"),
                *COMMON_PARAMETERS,
            ],
-            outputs=[IO.String.Output(display_name="3D Model Path")],
+            outputs=[
+                IO.String.Output(display_name="3D Model Path"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
@@ -271,9 +282,9 @@ class Rodin3D_Regular(IO.ComfyNode):
        )
        await poll_for_task_status(subscription_key, cls)
        download_list = await get_rodin_download_list(task_uuid, cls)
-        model = await download_files(download_list, task_uuid)
+        model_path, file_3d = await download_files(download_list, task_uuid)

-        return IO.NodeOutput(model)
+        return IO.NodeOutput(model_path, file_3d)


 class Rodin3D_Detail(IO.ComfyNode):
@@ -290,7 +301,10 @@ class Rodin3D_Detail(IO.ComfyNode):
                IO.Image.Input("Images"),
                *COMMON_PARAMETERS,
            ],
-            outputs=[IO.String.Output(display_name="3D Model Path")],
+            outputs=[
+                IO.String.Output(display_name="3D Model Path"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
@@ -327,9 +341,9 @@ class Rodin3D_Detail(IO.ComfyNode):
        )
        await poll_for_task_status(subscription_key, cls)
        download_list = await get_rodin_download_list(task_uuid, cls)
-        model = await download_files(download_list, task_uuid)
+        model_path, file_3d = await download_files(download_list, task_uuid)

-        return IO.NodeOutput(model)
+        return IO.NodeOutput(model_path, file_3d)


 class Rodin3D_Smooth(IO.ComfyNode):
@@ -346,7 +360,10 @@ class Rodin3D_Smooth(IO.ComfyNode):
                IO.Image.Input("Images"),
                *COMMON_PARAMETERS,
            ],
-            outputs=[IO.String.Output(display_name="3D Model Path")],
+            outputs=[
+                IO.String.Output(display_name="3D Model Path"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
@@ -382,9 +399,9 @@ class Rodin3D_Smooth(IO.ComfyNode):
        )
        await poll_for_task_status(subscription_key, cls)
        download_list = await get_rodin_download_list(task_uuid, cls)
-        model = await download_files(download_list, task_uuid)
+        model_path, file_3d = await download_files(download_list, task_uuid)

-        return IO.NodeOutput(model)
+        return IO.NodeOutput(model_path, file_3d)


 class Rodin3D_Sketch(IO.ComfyNode):
@@ -408,7 +425,10 @@ class Rodin3D_Sketch(IO.ComfyNode):
                    optional=True,
                ),
            ],
-            outputs=[IO.String.Output(display_name="3D Model Path")],
+            outputs=[
+                IO.String.Output(display_name="3D Model Path"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
@@ -441,9 +461,9 @@ class Rodin3D_Sketch(IO.ComfyNode):
        )
        await poll_for_task_status(subscription_key, cls)
        download_list = await get_rodin_download_list(task_uuid, cls)
-        model = await download_files(download_list, task_uuid)
+        model_path, file_3d = await download_files(download_list, task_uuid)

-        return IO.NodeOutput(model)
+        return IO.NodeOutput(model_path, file_3d)


 class Rodin3D_Gen2(IO.ComfyNode):
@@ -475,7 +495,10 @@ class Rodin3D_Gen2(IO.ComfyNode):
                ),
                IO.Boolean.Input("TAPose", default=False),
            ],
-            outputs=[IO.String.Output(display_name="3D Model Path")],
+            outputs=[
+                IO.String.Output(display_name="3D Model Path"),  # for backward compatibility only
+                IO.File3DGLB.Output(display_name="GLB"),
+            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
@@ -511,9 +534,9 @@ class Rodin3D_Gen2(IO.ComfyNode):
        )
        await poll_for_task_status(subscription_key, cls)
        download_list = await get_rodin_download_list(task_uuid, cls)
-        model = await download_files(download_list, task_uuid)
+        model_path, file_3d = await download_files(download_list, task_uuid)

-        return IO.NodeOutput(model)
+        return IO.NodeOutput(model_path, file_3d)


 class Rodin3DExtension(ComfyExtension):
@@ -149,7 +149,6 @@ class OpenAIVideoSora2(IO.ComfyNode):
            response_model=Sora2GenerationResponse,
            status_extractor=lambda x: x.status,
            poll_interval=8.0,
-            max_poll_attempts=160,
            estimated_duration=int(45 * (duration / 4) * model_time_multiplier),
        )
        return IO.NodeOutput(
@@ -203,7 +203,6 @@ class TopazImageEnhance(IO.ComfyNode):
            progress_extractor=lambda x: getattr(x, "progress", 0),
            price_extractor=lambda x: x.credits * 0.08,
            poll_interval=8.0,
-            max_poll_attempts=160,
            estimated_duration=60,
        )

@@ -1,10 +1,6 @@
-import os
-from typing import Optional
-
-import torch
 from typing_extensions import override

-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api_nodes.apis.tripo import (
    TripoAnimateRetargetRequest,
    TripoAnimateRigRequest,
@@ -26,12 +22,11 @@ from comfy_api_nodes.apis.tripo import (
 )
 from comfy_api_nodes.util import (
    ApiEndpoint,
-    download_url_as_bytesio,
+    download_url_to_file_3d,
    poll_op,
    sync_op,
    upload_images_to_comfyapi,
 )
-from folder_paths import get_output_directory


 def get_model_url_from_response(response: TripoTaskResponse) -> str:
@@ -45,7 +40,7 @@ def get_model_url_from_response(response: TripoTaskResponse) -> str:
 async def poll_until_finished(
    node_cls: type[IO.ComfyNode],
    response: TripoTaskResponse,
-    average_duration: Optional[int] = None,
+    average_duration: int | None = None,
 ) -> IO.NodeOutput:
    """Polls the Tripo API endpoint until the task reaches a terminal state, then returns the response."""
    if response.code != 0:
@@ -69,12 +64,8 @@ async def poll_until_finished(
    )
    if response_poll.data.status == TripoTaskStatus.SUCCESS:
        url = get_model_url_from_response(response_poll)
-        bytesio = await download_url_as_bytesio(url)
-        # Save the downloaded model file
-        model_file = f"tripo_model_{task_id}.glb"
-        with open(os.path.join(get_output_directory(), model_file), "wb") as f:
-            f.write(bytesio.getvalue())
-        return IO.NodeOutput(model_file, task_id)
+        file_glb = await download_url_to_file_3d(url, "glb", task_id=task_id)
+        return IO.NodeOutput(f"{task_id}.glb", task_id, file_glb)
    raise RuntimeError(f"Failed to generate mesh: {response_poll}")


@@ -107,8 +98,9 @@ class TripoTextToModelNode(IO.ComfyNode):
                IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -155,18 +147,18 @@ class TripoTextToModelNode(IO.ComfyNode):
    async def execute(
        cls,
        prompt: str,
-        negative_prompt: Optional[str] = None,
+        negative_prompt: str | None = None,
        model_version=None,
-        style: Optional[str] = None,
-        texture: Optional[bool] = None,
-        pbr: Optional[bool] = None,
-        image_seed: Optional[int] = None,
-        model_seed: Optional[int] = None,
-        texture_seed: Optional[int] = None,
-        texture_quality: Optional[str] = None,
-        geometry_quality: Optional[str] = None,
-        face_limit: Optional[int] = None,
-        quad: Optional[bool] = None,
+        style: str | None = None,
+        texture: bool | None = None,
+        pbr: bool | None = None,
+        image_seed: int | None = None,
+        model_seed: int | None = None,
+        texture_seed: int | None = None,
+        texture_quality: str | None = None,
+        geometry_quality: str | None = None,
+        face_limit: int | None = None,
+        quad: bool | None = None,
    ) -> IO.NodeOutput:
        style_enum = None if style == "None" else style
        if not prompt:
@@ -232,8 +224,9 @@ class TripoImageToModelNode(IO.ComfyNode):
                IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -279,19 +272,19 @@ class TripoImageToModelNode(IO.ComfyNode):
    @classmethod
    async def execute(
        cls,
-        image: torch.Tensor,
-        model_version: Optional[str] = None,
-        style: Optional[str] = None,
-        texture: Optional[bool] = None,
-        pbr: Optional[bool] = None,
-        model_seed: Optional[int] = None,
+        image: Input.Image,
+        model_version: str | None = None,
+        style: str | None = None,
+        texture: bool | None = None,
+        pbr: bool | None = None,
+        model_seed: int | None = None,
        orientation=None,
-        texture_seed: Optional[int] = None,
-        texture_quality: Optional[str] = None,
-        geometry_quality: Optional[str] = None,
-        texture_alignment: Optional[str] = None,
-        face_limit: Optional[int] = None,
-        quad: Optional[bool] = None,
+        texture_seed: int | None = None,
+        texture_quality: str | None = None,
+        geometry_quality: str | None = None,
+        texture_alignment: str | None = None,
+        face_limit: int | None = None,
+        quad: bool | None = None,
    ) -> IO.NodeOutput:
        style_enum = None if style == "None" else style
        if image is None:
@@ -368,8 +361,9 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
                IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -411,21 +405,21 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
    @classmethod
    async def execute(
        cls,
-        image: torch.Tensor,
-        image_left: Optional[torch.Tensor] = None,
-        image_back: Optional[torch.Tensor] = None,
-        image_right: Optional[torch.Tensor] = None,
-        model_version: Optional[str] = None,
-        orientation: Optional[str] = None,
-        texture: Optional[bool] = None,
-        pbr: Optional[bool] = None,
-        model_seed: Optional[int] = None,
-        texture_seed: Optional[int] = None,
-        texture_quality: Optional[str] = None,
-        geometry_quality: Optional[str] = None,
-        texture_alignment: Optional[str] = None,
-        face_limit: Optional[int] = None,
-        quad: Optional[bool] = None,
+        image: Input.Image,
+        image_left: Input.Image | None = None,
+        image_back: Input.Image | None = None,
+        image_right: Input.Image | None = None,
+        model_version: str | None = None,
+        orientation: str | None = None,
+        texture: bool | None = None,
+        pbr: bool | None = None,
+        model_seed: int | None = None,
+        texture_seed: int | None = None,
+        texture_quality: str | None = None,
+        geometry_quality: str | None = None,
+        texture_alignment: str | None = None,
+        face_limit: int | None = None,
+        quad: bool | None = None,
    ) -> IO.NodeOutput:
        if image is None:
            raise RuntimeError("front image for multiview is required")
@@ -487,8 +481,9 @@ class TripoTextureNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -512,11 +507,11 @@ class TripoTextureNode(IO.ComfyNode):
    async def execute(
        cls,
        model_task_id,
-        texture: Optional[bool] = None,
-        pbr: Optional[bool] = None,
-        texture_seed: Optional[int] = None,
-        texture_quality: Optional[str] = None,
-        texture_alignment: Optional[str] = None,
+        texture: bool | None = None,
+        pbr: bool | None = None,
+        texture_seed: int | None = None,
+        texture_quality: str | None = None,
+        texture_alignment: str | None = None,
    ) -> IO.NodeOutput:
        response = await sync_op(
            cls,
@@ -547,8 +542,9 @@ class TripoRefineNode(IO.ComfyNode):
                IO.Custom("MODEL_TASK_ID").Input("model_task_id", tooltip="Must be a v1.4 Tripo model"),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -583,8 +579,9 @@ class TripoRigNode(IO.ComfyNode):
            category="api node/3d/Tripo",
            inputs=[IO.Custom("MODEL_TASK_ID").Input("original_model_task_id")],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("RIG_TASK_ID").Output(display_name="rig task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -642,8 +639,9 @@ class TripoRetargetNode(IO.ComfyNode):
                ),
            ],
            outputs=[
-                IO.String.Output(display_name="model_file"),
+                IO.String.Output(display_name="model_file"),  # for backward compatibility only
                IO.Custom("RETARGET_TASK_ID").Output(display_name="retarget task_id"),
+                IO.File3DGLB.Output(display_name="GLB"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
@@ -2,9 +2,12 @@ from typing_extensions import override

 from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api_nodes.apis.vidu import (
+    FrameSetting,
    SubjectReference,
    TaskCreationRequest,
    TaskCreationResponse,
+    TaskExtendCreationRequest,
+    TaskMultiFrameCreationRequest,
    TaskResult,
    TaskStatusResponse,
 )
@@ -14,11 +17,14 @@ from comfy_api_nodes.util import (
    get_number_of_images,
    poll_op,
    sync_op,
+    upload_image_to_comfyapi,
    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
    validate_image_aspect_ratio,
    validate_image_dimensions,
    validate_images_aspect_ratio_closeness,
    validate_string,
+    validate_video_duration,
 )

 VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video"
@@ -31,7 +37,8 @@ VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations"
 async def execute_task(
    cls: type[IO.ComfyNode],
    vidu_endpoint: str,
-    payload: TaskCreationRequest,
+    payload: TaskCreationRequest | TaskExtendCreationRequest | TaskMultiFrameCreationRequest,
+    max_poll_attempts: int = 320,
 ) -> list[TaskResult]:
    task_creation_response = await sync_op(
        cls,
@@ -47,7 +54,7 @@ async def execute_task(
        response_model=TaskStatusResponse,
        status_extractor=lambda r: r.state,
        progress_extractor=lambda r: r.progress,
-        max_poll_attempts=320,
+        max_poll_attempts=max_poll_attempts,
    )
    if not response.creations:
        raise RuntimeError(
@@ -703,7 +710,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode):
                    "subjects",
                    template=IO.Autogrow.TemplateNames(
                        IO.Image.Input("reference_images"),
-                        names=["subject1", "subject2", "subject3"],
+                        names=["subject1", "subject2", "subject3", "subject4", "subject5", "subject6", "subject7"],
                        min=1,
                    ),
                    tooltip="For each subject, provide up to 3 reference images (7 images total across all subjects). "
@@ -738,7 +745,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode):
                    control_after_generate=True,
                ),
                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "4:3", "3:4", "1:1"]),
-                IO.Combo.Input("resolution", options=["720p"]),
+                IO.Combo.Input("resolution", options=["720p", "1080p"]),
                IO.Combo.Input(
                    "movement_amplitude",
                    options=["auto", "small", "medium", "large"],
@@ -940,6 +947,540 @@ class Vidu2StartEndToVideoNode(IO.ComfyNode):
        return IO.NodeOutput(await download_url_to_video_output(results[0].url))


+class ViduExtendVideoNode(IO.ComfyNode):  # API node: extends a source video via the "/proxy/vidu/extend" endpoint
+
+    @classmethod
+    def define_schema(cls):  # declares inputs, outputs, hidden auth fields and the price badge
+        return IO.Schema(
+            node_id="ViduExtendVideoNode",
+            display_name="Vidu Video Extension",
+            category="api node/video/Vidu",
+            description="Extend an existing video by generating additional frames.",
+            inputs=[
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[  # both options expose the same nested "duration" and "resolution" widgets
+                        IO.DynamicCombo.Option(
+                            "viduq2-pro",
+                            [
+                                IO.Int.Input(
+                                    "duration",
+                                    default=4,
+                                    min=1,
+                                    max=7,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Duration of the extended video in seconds.",
+                                ),
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720p", "1080p"],
+                                    tooltip="Resolution of the output video.",
+                                ),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option(
+                            "viduq2-turbo",
+                            [
+                                IO.Int.Input(
+                                    "duration",
+                                    default=4,
+                                    min=1,
+                                    max=7,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Duration of the extended video in seconds.",
+                                ),
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720p", "1080p"],
+                                    tooltip="Resolution of the output video.",
+                                ),
+                            ],
+                        ),
+                    ],
+                    tooltip="Model to use for video extension.",
+                ),
+                IO.Video.Input(
+                    "video",
+                    tooltip="The source video to extend.",
+                ),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="An optional text prompt for the extended video (max 2000 characters).",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=1,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                ),
+                IO.Image.Input("end_frame", optional=True),  # the only optional input of this node
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(  # price = base + perSec * (duration - 1); the tables differ for "pro" vs other models
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.duration", "model.resolution"]),
+                expr="""
+                (
+                  $m := widgets.model;
+                  $d := $lookup(widgets, "model.duration");
+                  $res := $lookup(widgets, "model.resolution");
+                  $contains($m, "pro")
+                    ? (
+                        $base := $lookup({"720p": 0.15, "1080p": 0.3}, $res);
+                        $perSec := $lookup({"720p": 0.05, "1080p": 0.075}, $res);
+                        {"type":"usd","usd": $base + $perSec * ($d - 1)}
+                      )
+                    : (
+                        $base := $lookup({"720p": 0.075, "1080p": 0.2}, $res);
+                        $perSec := $lookup({"720p": 0.025, "1080p": 0.05}, $res);
+                        {"type":"usd","usd": $base + $perSec * ($d - 1)}
+                      )
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(  # validates inputs, uploads media, runs the extend task and returns the first resulting video
+        cls,
+        model: dict,  # read below via the keys "model", "duration" and "resolution"
+        video: Input.Video,
+        prompt: str,
+        seed: int,
+        end_frame: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, max_length=2000)  # no min_length: the prompt may be left empty
+        validate_video_duration(video, min_duration=4, max_duration=55)  # presumably seconds - confirm against the helper
+        image_url = None  # stays None when no end_frame is connected
+        if end_frame is not None:
+            validate_image_aspect_ratio(end_frame, (1, 4), (4, 1))
+            validate_image_dimensions(end_frame, min_width=128, min_height=128)
+            image_url = await upload_image_to_comfyapi(cls, end_frame, wait_label="Uploading end frame")
+        results = await execute_task(
+            cls,
+            "/proxy/vidu/extend",
+            TaskExtendCreationRequest(
+                model=model["model"],
+                prompt=prompt,
+                duration=model["duration"],
+                seed=seed,
+                resolution=model["resolution"],
+                video_url=await upload_video_to_comfyapi(cls, video, wait_label="Uploading video"),  # awaited after the end-frame upload
+                images=[image_url] if image_url else None,  # None rather than an empty list when no end frame was given
+            ),
+            max_poll_attempts=480,  # raises execute_task's default of 320
+        )
+        return IO.NodeOutput(await download_url_to_video_output(results[0].url))  # only the first creation is downloaded
+
+
+def _generate_frame_inputs(count: int) -> list:
+    """Return a flat list of prompt{i}/end_image{i}/duration{i} input widgets for i = 1..count."""
+    inputs = []
+    for i in range(1, count + 1):  # widget names are 1-based
+        inputs.extend(  # three widgets per frame, always in prompt, end_image, duration order
+            [
+                IO.String.Input(
+                    f"prompt{i}",
+                    multiline=True,
+                    default="",
+                    tooltip=f"Text prompt for frame {i} transition.",
+                ),
+                IO.Image.Input(  # not marked optional, unlike the Extend node's "end_frame" input
+                    f"end_image{i}",
+                    tooltip=f"End frame image for segment {i}. Aspect ratio must be between 1:4 and 4:1.",
+                ),
+                IO.Int.Input(
+                    f"duration{i}",
+                    default=4,
+                    min=2,
+                    max=7,
+                    step=1,
+                    display_mode=IO.NumberDisplay.slider,
+                    tooltip=f"Duration for segment {i} in seconds.",
+                ),
+            ]
+        )
+    return inputs
+
+
+class ViduMultiFrameVideoNode(IO.ComfyNode):  # API node: start image plus 2-9 keyframe segments sent to "/proxy/vidu/multiframe"
+
+    @classmethod
+    def define_schema(cls):  # declares inputs, outputs, hidden auth fields and the price badge
+        return IO.Schema(
+            node_id="ViduMultiFrameVideoNode",
+            display_name="Vidu Multi-Frame Video Generation",
+            category="api node/video/Vidu",
+            description="Generate a video with multiple keyframe transitions.",
+            inputs=[
+                IO.Combo.Input("model", options=["viduq2-pro", "viduq2-turbo"]),
+                IO.Image.Input(
+                    "start_image",
+                    tooltip="The starting frame image. Aspect ratio must be between 1:4 and 4:1.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=1,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                ),
+                IO.Combo.Input("resolution", options=["720p", "1080p"]),
+                IO.DynamicCombo.Input(
+                    "frames",
+                    options=[  # option keys are the strings "2".."9"; each option gets its own freshly built widget list
+                        IO.DynamicCombo.Option("2", _generate_frame_inputs(2)),
+                        IO.DynamicCombo.Option("3", _generate_frame_inputs(3)),
+                        IO.DynamicCombo.Option("4", _generate_frame_inputs(4)),
+                        IO.DynamicCombo.Option("5", _generate_frame_inputs(5)),
+                        IO.DynamicCombo.Option("6", _generate_frame_inputs(6)),
+                        IO.DynamicCombo.Option("7", _generate_frame_inputs(7)),
+                        IO.DynamicCombo.Option("8", _generate_frame_inputs(8)),
+                        IO.DynamicCombo.Option("9", _generate_frame_inputs(9)),
+                    ],
+                    tooltip="Number of keyframe transitions (2-9).",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(  # price = n * base + perSec * (sum of the first n durations)
+                depends_on=IO.PriceBadgeDepends(
+                    widgets=[  # all nine duration widgets are listed; the expr zeroes those at index 3+ beyond the frame count
+                        "model",
+                        "resolution",
+                        "frames",
+                        "frames.duration1",
+                        "frames.duration2",
+                        "frames.duration3",
+                        "frames.duration4",
+                        "frames.duration5",
+                        "frames.duration6",
+                        "frames.duration7",
+                        "frames.duration8",
+                        "frames.duration9",
+                    ]
+                ),
+                expr="""
+                (
+                  $m := widgets.model;
+                  $n := $number(widgets.frames);
+                  $is1080 := widgets.resolution = "1080p";
+                  $d1 := $lookup(widgets, "frames.duration1");
+                  $d2 := $lookup(widgets, "frames.duration2");
+                  $d3 := $n >= 3 ? $lookup(widgets, "frames.duration3") : 0;
+                  $d4 := $n >= 4 ? $lookup(widgets, "frames.duration4") : 0;
+                  $d5 := $n >= 5 ? $lookup(widgets, "frames.duration5") : 0;
+                  $d6 := $n >= 6 ? $lookup(widgets, "frames.duration6") : 0;
+                  $d7 := $n >= 7 ? $lookup(widgets, "frames.duration7") : 0;
+                  $d8 := $n >= 8 ? $lookup(widgets, "frames.duration8") : 0;
+                  $d9 := $n >= 9 ? $lookup(widgets, "frames.duration9") : 0;
+                  $totalDuration := $d1 + $d2 + $d3 + $d4 + $d5 + $d6 + $d7 + $d8 + $d9;
+                  $contains($m, "pro")
+                    ? (
+                        $base := $is1080 ? 0.3 : 0.15;
+                        $perSec := $is1080 ? 0.075 : 0.05;
+                        {"type":"usd","usd": $n * $base + $perSec * $totalDuration}
+                      )
+                    : (
+                        $base := $is1080 ? 0.2 : 0.075;
+                        $perSec := $is1080 ? 0.05 : 0.025;
+                        {"type":"usd","usd": $n * $base + $perSec * $totalDuration}
+                      )
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(  # validates all frames, uploads start and end images, runs the task and returns the first video
+        cls,
+        model: str,
+        start_image: Input.Image,
+        seed: int,
+        resolution: str,
+        frames: dict,  # read below via "frames" plus the per-frame keys prompt{i}, end_image{i}, duration{i}
+    ) -> IO.NodeOutput:
+        validate_image_aspect_ratio(start_image, (1, 4), (4, 1))
+        frame_count = int(frames["frames"])  # presumably the selected option key ("2".."9") - confirm
+        image_settings: list[FrameSetting] = []
+        for i in range(1, frame_count + 1):  # validate every frame up front, before any upload is started
+            validate_image_aspect_ratio(frames[f"end_image{i}"], (1, 4), (4, 1))
+            validate_string(frames[f"prompt{i}"], max_length=2000)
+        start_image_url = await upload_image_to_comfyapi(
+            cls,
+            start_image,
+            mime_type="image/png",
+            wait_label="Uploading start image",
+        )
+        for i in range(1, frame_count + 1):  # end images are uploaded one at a time, in frame order
+            image_settings.append(
+                FrameSetting(
+                    prompt=frames[f"prompt{i}"],
+                    key_image=await upload_image_to_comfyapi(
+                        cls,
+                        frames[f"end_image{i}"],
+                        mime_type="image/png",
+                        wait_label=f"Uploading end image({i})",
+                    ),
+                    duration=frames[f"duration{i}"],
+                )
+            )
+        results = await execute_task(
+            cls,
+            "/proxy/vidu/multiframe",
+            TaskMultiFrameCreationRequest(
+                model=model,
+                seed=seed,
+                resolution=resolution,
+                start_image=start_image_url,
+                image_settings=image_settings,
+            ),
+            max_poll_attempts=480 * frame_count,  # polling budget grows linearly with the number of frames
+        )
+        return IO.NodeOutput(await download_url_to_video_output(results[0].url))  # only the first creation is downloaded
+
+
+class Vidu3TextToVideoNode(IO.ComfyNode):  # API node: Vidu Q3 text-to-video, posted to VIDU_TEXT_TO_VIDEO
+
+    @classmethod
+    def define_schema(cls):  # declares inputs, outputs, hidden auth fields and the price badge
+        return IO.Schema(
+            node_id="Vidu3TextToVideoNode",
+            display_name="Vidu Q3 Text-to-Video Generation",
+            category="api node/video/Vidu",
+            description="Generate video from a text prompt.",
+            inputs=[
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[  # a single option for now; its nested widgets are read in execute() via the model dict
+                        IO.DynamicCombo.Option(
+                            "viduq3-pro",
+                            [
+                                IO.Combo.Input(
+                                    "aspect_ratio",
+                                    options=["16:9", "9:16", "3:4", "4:3", "1:1"],
+                                    tooltip="The aspect ratio of the output video.",
+                                ),
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720p", "1080p"],
+                                    tooltip="Resolution of the output video.",
+                                ),
+                                IO.Int.Input(
+                                    "duration",
+                                    default=5,
+                                    min=1,
+                                    max=16,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Duration of the output video in seconds.",
+                                ),
+                                IO.Boolean.Input(
+                                    "audio",
+                                    default=False,
+                                    tooltip="When enabled, outputs video with sound "
+                                    "(including dialogue and sound effects).",
+                                ),
+                            ],
+                        ),
+                    ],
+                    tooltip="Model to use for video generation.",
+                ),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A textual description for video generation, with a maximum length of 2000 characters.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=1,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(  # price = base + perSec * (duration - 1); the "audio" toggle is not part of the expr
+                depends_on=IO.PriceBadgeDepends(widgets=["model.duration", "model.resolution"]),
+                expr="""
+                (
+                  $res := $lookup(widgets, "model.resolution");
+                  $base := $lookup({"720p": 0.075, "1080p": 0.1}, $res);
+                  $perSec := $lookup({"720p": 0.025, "1080p": 0.05}, $res);
+                  {"type":"usd","usd": $base + $perSec * ($lookup(widgets, "model.duration") - 1)}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(  # validates the prompt, runs the text-to-video task and returns the first resulting video
+        cls,
+        model: dict,  # read below via "model", "duration", "aspect_ratio", "resolution" and "audio"
+        prompt: str,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, min_length=1, max_length=2000)  # min_length=1: a prompt is mandatory for this node
+        results = await execute_task(
+            cls,
+            VIDU_TEXT_TO_VIDEO,
+            TaskCreationRequest(
+                model=model["model"],
+                prompt=prompt,
+                duration=model["duration"],
+                seed=seed,
+                aspect_ratio=model["aspect_ratio"],
+                resolution=model["resolution"],
+                audio=model["audio"],
+            ),
+            max_poll_attempts=640,  # raises execute_task's default of 320
+        )
+        return IO.NodeOutput(await download_url_to_video_output(results[0].url))  # only the first creation is downloaded
+
+
+class Vidu3ImageToVideoNode(IO.ComfyNode):  # API node: Vidu Q3 image-to-video, posted to VIDU_IMAGE_TO_VIDEO
+
+    @classmethod
+    def define_schema(cls):  # declares inputs, outputs, hidden auth fields and the price badge
+        return IO.Schema(
+            node_id="Vidu3ImageToVideoNode",
+            display_name="Vidu Q3 Image-to-Video Generation",
+            category="api node/video/Vidu",
+            description="Generate a video from an image and an optional prompt.",
+            inputs=[
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[  # a single option for now; its nested widgets are read in execute() via the model dict
+                        IO.DynamicCombo.Option(
+                            "viduq3-pro",
+                            [
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720p", "1080p", "2K"],  # NOTE(review): price expr keys use lowercase "2k" - confirm values are lowercased
+                                    tooltip="Resolution of the output video.",
+                                ),
+                                IO.Int.Input(
+                                    "duration",
+                                    default=5,
+                                    min=1,
+                                    max=16,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Duration of the output video in seconds.",
+                                ),
+                                IO.Boolean.Input(
+                                    "audio",
+                                    default=False,
+                                    tooltip="When enabled, outputs video with sound "
+                                    "(including dialogue and sound effects).",
+                                ),
+                            ],
+                        ),
+                    ],
+                    tooltip="Model to use for video generation.",
+                ),
+                IO.Image.Input(
+                    "image",
+                    tooltip="An image to be used as the start frame of the generated video.",
+                ),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="An optional text prompt for video generation (max 2000 characters).",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=1,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(  # price = base + perSec * (duration - 1); the "audio" toggle is not part of the expr
+                depends_on=IO.PriceBadgeDepends(widgets=["model.duration", "model.resolution"]),
+                expr="""
+                (
+                  $res := $lookup(widgets, "model.resolution");
+                  $base := $lookup({"720p": 0.075, "1080p": 0.275, "2k": 0.35}, $res);
+                  $perSec := $lookup({"720p": 0.05, "1080p": 0.075, "2k": 0.075}, $res);
+                  {"type":"usd","usd": $base + $perSec * ($lookup(widgets, "model.duration") - 1)}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(  # validates inputs, uploads the start image, runs the task and returns the first resulting video
+        cls,
+        model: dict,  # read below via "model", "duration", "resolution" and "audio"
+        image: Input.Image,
+        prompt: str,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_image_aspect_ratio(image, (1, 4), (4, 1))
+        validate_string(prompt, max_length=2000)  # no min_length: the prompt may be left empty
+        results = await execute_task(
+            cls,
+            VIDU_IMAGE_TO_VIDEO,
+            TaskCreationRequest(
+                model=model["model"],
+                prompt=prompt,
+                duration=model["duration"],
+                seed=seed,
+                resolution=model["resolution"],
+                audio=model["audio"],
+                images=[await upload_image_to_comfyapi(cls, image)],  # the awaited upload result is the only entry of "images"
+            ),
+            max_poll_attempts=720,  # raises execute_task's default of 320
+        )
+        return IO.NodeOutput(await download_url_to_video_output(results[0].url))  # only the first creation is downloaded
+
+
 class ViduExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -952,6 +1493,10 @@ class ViduExtension(ComfyExtension):
            Vidu2ImageToVideoNode,
            Vidu2ReferenceVideoNode,
            Vidu2StartEndToVideoNode,
+            ViduExtendVideoNode,
+            ViduMultiFrameVideoNode,
+            Vidu3TextToVideoNode,
+            Vidu3ImageToVideoNode,
        ]


@@ -0,0 +1,178 @@
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.wavespeed import (
+    FlashVSRRequest,
+    TaskCreatedResponse,
+    TaskResultResponse,
+    SeedVR2ImageRequest,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_video_output,
+    poll_op,
+    sync_op,
+    upload_video_to_comfyapi,
+    validate_container_format_is_mp4,
+    validate_video_duration,
+    upload_images_to_comfyapi,
+    get_number_of_images,
+    download_url_to_image_tensor,
+)
+
+
+class WavespeedFlashVSRNode(IO.ComfyNode):
+    """API node that upscales a video through the WaveSpeed FlashVSR endpoint."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the video input, target-resolution combo, video output and price badge."""
+        resolution_options = ["720p", "1080p", "2K", "4K"]
+        return IO.Schema(
+            node_id="WavespeedFlashVSRNode",
+            display_name="FlashVSR Video Upscale",
+            category="api node/video/WaveSpeed",
+            description="Fast, high-quality video upscaler that "
+            "boosts resolution and restores clarity for low-resolution or blurry footage.",
+            inputs=[
+                IO.Video.Input("video"),
+                IO.Combo.Input("target_resolution", options=resolution_options),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["target_resolution"]),
+                expr="""
+                (
+                  $price_for_1sec := {"720p": 0.012, "1080p": 0.018, "2k": 0.024, "4k": 0.032};
+                  {
+                    "type":"usd",
+                    "usd": $lookup($price_for_1sec, widgets.target_resolution),
+                    "format":{"suffix": "/second", "approximate": true}
+                  }
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        target_resolution: str,
+    ) -> IO.NodeOutput:
+        """Upload the video, create a FlashVSR task, poll it and return the first output as a video.
+
+        Raises:
+            ValueError: If the task-creation response or the final poll response
+                carries a ``code`` other than 200.
+        """
+        validate_container_format_is_mp4(video)
+        validate_video_duration(video, min_duration=5, max_duration=60 * 10)
+
+        create_endpoint = ApiEndpoint(path="/proxy/wavespeed/api/v3/wavespeed-ai/flashvsr", method="POST")
+        request = FlashVSRRequest(
+            target_resolution=target_resolution.lower(),
+            video=await upload_video_to_comfyapi(cls, video),
+            duration=video.get_duration(),
+        )
+        created = await sync_op(cls, create_endpoint, response_model=TaskCreatedResponse, data=request)
+        if created.code != 200:
+            raise ValueError(f"Task creation fails with code={created.code} and message={created.message}")
+
+        def extract_status(response):
+            # A response without a data payload is reported to the poller as "failed".
+            return "failed" if response.data is None else response.data.status
+
+        result_endpoint = ApiEndpoint(path=f"/proxy/wavespeed/api/v3/predictions/{created.data.id}/result")
+        finished = await poll_op(
+            cls,
+            result_endpoint,
+            response_model=TaskResultResponse,
+            status_extractor=extract_status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        if finished.code != 200:
+            raise ValueError(
+                f"Task processing failed with code={finished.code} and message={finished.message}"
+            )
+        output_url = finished.data.outputs[0]
+        return IO.NodeOutput(await download_url_to_video_output(output_url))
+
+
+class WavespeedImageUpscaleNode(IO.ComfyNode):
+    """API node that upscales a single image through one of two WaveSpeed upscaler endpoints."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the model/image/resolution inputs, image output and price badge."""
+        model_options = ["SeedVR2", "Ultimate"]
+        resolution_options = ["2K", "4K", "8K"]
+        return IO.Schema(
+            node_id="WavespeedImageUpscaleNode",
+            display_name="WaveSpeed Image Upscale",
+            category="api node/image/WaveSpeed",
+            description="Boost image resolution and quality, upscaling photos to 4K or 8K for sharp, detailed results.",
+            inputs=[
+                IO.Combo.Input("model", options=model_options),
+                IO.Image.Input("image"),
+                IO.Combo.Input("target_resolution", options=resolution_options),
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+                expr="""
+                (
+                  $prices := {"seedvr2": 0.01, "ultimate": 0.06};
+                  {"type":"usd", "usd": $lookup($prices, widgets.model)}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        image: Input.Image,
+        target_resolution: str,
+    ) -> IO.NodeOutput:
+        """Upload one image, create an upscale task, poll it and return the first output as an image.
+
+        Raises:
+            ValueError: If ``image`` does not contain exactly one image, or if the
+                task-creation or final poll response carries a ``code`` other than 200.
+        """
+        if get_number_of_images(image) != 1:
+            raise ValueError("Exactly one input image is required.")
+
+        # Any model value other than "SeedVR2" is routed to the "ultimate" upscaler path.
+        model_path = "seedvr2/image" if model == "SeedVR2" else "ultimate-image-upscaler"
+        create_endpoint = ApiEndpoint(path=f"/proxy/wavespeed/api/v3/wavespeed-ai/{model_path}", method="POST")
+        resolution = target_resolution.lower()
+        uploaded_urls = await upload_images_to_comfyapi(cls, image, max_images=1)
+        request = SeedVR2ImageRequest(target_resolution=resolution, image=uploaded_urls[0])
+        created = await sync_op(cls, create_endpoint, response_model=TaskCreatedResponse, data=request)
+        if created.code != 200:
+            raise ValueError(f"Task creation fails with code={created.code} and message={created.message}")
+
+        def extract_status(response):
+            # A response without a data payload is reported to the poller as "failed".
+            return "failed" if response.data is None else response.data.status
+
+        result_endpoint = ApiEndpoint(path=f"/proxy/wavespeed/api/v3/predictions/{created.data.id}/result")
+        finished = await poll_op(
+            cls,
+            result_endpoint,
+            response_model=TaskResultResponse,
+            status_extractor=extract_status,
+            poll_interval=10.0,
+            max_poll_attempts=480,
+        )
+        if finished.code != 200:
+            raise ValueError(
+                f"Task processing failed with code={finished.code} and message={finished.message}"
+            )
+        output_url = finished.data.outputs[0]
+        return IO.NodeOutput(await download_url_to_image_tensor(output_url))
+
+
+class WavespeedExtension(ComfyExtension):
+    """Extension that exposes the WaveSpeed API nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_classes: list[type[IO.ComfyNode]] = [WavespeedFlashVSRNode, WavespeedImageUpscaleNode]
+        return node_classes
+
+
+async def comfy_entrypoint() -> WavespeedExtension:
+    """Return a new ``WavespeedExtension`` instance."""
+    extension = WavespeedExtension()
+    return extension
@@ -13,6 +13,7 @@ from .conversions import (
    bytesio_to_image_tensor,
    convert_mask_to_image,
    downscale_image_tensor,
+    downscale_image_tensor_by_max_side,
    image_tensor_pair_to_batch,
    pil_to_bytesio,
    resize_mask_to_image,
@@ -27,12 +28,14 @@ from .conversions import (
 from .download_helpers import (
    download_url_as_bytesio,
    download_url_to_bytesio,
+    download_url_to_file_3d,
    download_url_to_image_tensor,
    download_url_to_video_output,
 )
 from .upload_helpers import (
    upload_audio_to_comfyapi,
    upload_file_to_comfyapi,
+    upload_image_to_comfyapi,
    upload_images_to_comfyapi,
    upload_video_to_comfyapi,
 )
@@ -61,11 +64,13 @@ __all__ = [
    # Upload helpers
    "upload_audio_to_comfyapi",
    "upload_file_to_comfyapi",
+    "upload_image_to_comfyapi",
    "upload_images_to_comfyapi",
    "upload_video_to_comfyapi",
    # Download helpers
    "download_url_as_bytesio",
    "download_url_to_bytesio",
+    "download_url_to_file_3d",
    "download_url_to_image_tensor",
    "download_url_to_video_output",
    # Conversions
@@ -75,6 +80,7 @@ __all__ = [
    "bytesio_to_image_tensor",
    "convert_mask_to_image",
    "downscale_image_tensor",
+    "downscale_image_tensor_by_max_side",
    "image_tensor_pair_to_batch",
    "pil_to_bytesio",
    "resize_mask_to_image",
@@ -141,7 +141,7 @@ async def poll_op(
    queued_statuses: list[str | int] | None = None,
    data: BaseModel | None = None,
    poll_interval: float = 5.0,
-    max_poll_attempts: int = 120,
+    max_poll_attempts: int = 160,
    timeout_per_poll: float = 120.0,
    max_retries_per_poll: int = 3,
    retry_delay_per_poll: float = 1.0,
@@ -238,7 +238,7 @@ async def poll_op_raw(
    queued_statuses: list[str | int] | None = None,
    data: dict[str, Any] | BaseModel | None = None,
    poll_interval: float = 5.0,
-    max_poll_attempts: int = 120,
+    max_poll_attempts: int = 160,
    timeout_per_poll: float = 120.0,
    max_retries_per_poll: int = 3,
    retry_delay_per_poll: float = 1.0,
@@ -56,15 +56,14 @@ def image_tensor_pair_to_batch(image1: torch.Tensor, image2: torch.Tensor) -> to
 def tensor_to_bytesio(
    image: torch.Tensor,
    *,
-    total_pixels: int = 2048 * 2048,
+    total_pixels: int | None = 2048 * 2048,
    mime_type: str = "image/png",
 ) -> BytesIO:
    """Converts a torch.Tensor image to a named BytesIO object.

    Args:
        image: Input torch.Tensor image.
-        name: Optional filename for the BytesIO object.
-        total_pixels: Maximum total pixels for potential downscaling.
+        total_pixels: Maximum total pixels for downscaling. If None, no downscaling is performed.
        mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4').

    Returns:
@@ -79,13 +78,14 @@ def tensor_to_bytesio(
    return img_binary


-def tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image:
+def tensor_to_pil(image: torch.Tensor, total_pixels: int | None = 2048 * 2048) -> Image.Image:
    """Converts a single torch.Tensor image [H, W, C] to a PIL Image, optionally downscaling."""
    if len(image.shape) > 3:
        image = image[0]
    # TODO: remove alpha if not allowed and present
    input_tensor = image.cpu()
-    input_tensor = downscale_image_tensor(input_tensor.unsqueeze(0), total_pixels=total_pixels).squeeze()
+    if total_pixels is not None:
+        input_tensor = downscale_image_tensor(input_tensor.unsqueeze(0), total_pixels=total_pixels).squeeze()
    image_np = (input_tensor.numpy() * 255).astype(np.uint8)
    img = Image.fromarray(image_np)
    return img
@@ -93,14 +93,14 @@ def tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image

 def tensor_to_base64_string(
    image_tensor: torch.Tensor,
-    total_pixels: int = 2048 * 2048,
+    total_pixels: int | None = 2048 * 2048,
    mime_type: str = "image/png",
 ) -> str:
    """Convert [B, H, W, C] or [H, W, C] tensor to a base64 string.

    Args:
        image_tensor: Input torch.Tensor image.
-        total_pixels: Maximum total pixels for potential downscaling.
+        total_pixels: Maximum total pixels for downscaling. If None, no downscaling is performed.
        mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4').

    Returns:
@@ -144,16 +144,31 @@ def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024)
    return s


+def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
+    """Downscale an image tensor so that its largest spatial dimension is at most ``max_side``.
+
+    Args:
+        image: Channels-last image tensor; the last dimension is moved to position 1 and
+            dimensions 2 and 3 of the result are read as height and width
+            (presumably ``[B, H, W, C]`` — confirm against callers).
+        max_side: Maximum allowed size, in pixels, of the larger of width and height.
+
+    Returns:
+        ``image`` itself when it already fits, otherwise a lanczos-resized tensor in the
+        same channels-last layout with the aspect ratio preserved (up to rounding).
+
+    Raises:
+        ValueError: If ``max_side`` is smaller than 1.
+    """
+    if max_side < 1:
+        raise ValueError(f"max_side must be at least 1, got {max_side}")
+    samples = image.movedim(-1, 1)
+    height, width = samples.shape[2], samples.shape[3]
+    max_dim = max(width, height)
+    if max_dim <= max_side:
+        return image
+    scale_by = max_side / max_dim
+    # Clamp to one pixel: for extreme aspect ratios the short side would otherwise round to 0.
+    new_width = max(1, round(width * scale_by))
+    new_height = max(1, round(height * scale_by))
+    s = common_upscale(samples, new_width, new_height, "lanczos", "disabled")
+    return s.movedim(1, -1)
+
+
 def tensor_to_data_uri(
    image_tensor: torch.Tensor,
-    total_pixels: int = 2048 * 2048,
+    total_pixels: int | None = 2048 * 2048,
    mime_type: str = "image/png",
 ) -> str:
    """Converts a tensor image to a Data URI string.

    Args:
        image_tensor: Input torch.Tensor image.
-        total_pixels: Maximum total pixels for potential downscaling.
+        total_pixels: Maximum total pixels for downscaling. If None, no downscaling is performed.
        mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp').

    Returns:
@@ -11,7 +11,8 @@ import torch
 from aiohttp.client_exceptions import ClientError, ContentTypeError

 from comfy_api.latest import IO as COMFY_IO
-from comfy_api.latest import InputImpl
+from comfy_api.latest import InputImpl, Types
+from folder_paths import get_output_directory

 from . import request_logger
 from ._helpers import (
@@ -261,3 +262,38 @@ def _generate_operation_id(method: str, url: str, attempt: int) -> str:
    except Exception:
        slug = "download"
    return f"{method}_{slug}_try{attempt}_{uuid.uuid4().hex[:8]}"
+
+
+async def download_url_to_file_3d(
+    url: str,
+    file_format: str,
+    *,
+    task_id: str | None = None,
+    timeout: float | None = None,
+    max_retries: int = 5,
+    cls: type[COMFY_IO.ComfyNode] | None = None,
+) -> Types.File3D:
+    """Download a 3D model file from a URL into memory and wrap it in a ``Types.File3D``.
+
+    Args:
+        url: Location of the file to download.
+        file_format: File extension of the model; leading dots are stripped and the
+            value is lower-cased before use.
+        task_id: If provided, the downloaded bytes are also written to
+            ``<output directory>/<task_id>.<file_format>`` for backward compatibility
+            with the old save-to-disk behavior.
+        timeout: Forwarded unchanged to ``download_url_to_bytesio``.
+        max_retries: Forwarded unchanged to ``download_url_to_bytesio``.
+        cls: Forwarded unchanged to ``download_url_to_bytesio``.
+
+    Returns:
+        A ``Types.File3D`` whose source is the in-memory buffer, rewound to its start.
+    """
+    file_format = file_format.lstrip(".").lower()
+    data = BytesIO()
+    await download_url_to_bytesio(
+        url,
+        data,
+        timeout=timeout,
+        max_retries=max_retries,
+        cls=cls,
+    )
+
+    if task_id is not None:
+        # This is only for backward compatibility with current behavior when every 3D node is output node
+        # All new API nodes should not use "task_id" and instead users should use "SaveGLB" node to save results
+        output_path = Path(get_output_directory()) / f"{task_id}.{file_format}"
+        output_path.write_bytes(data.getvalue())
+
+    # Rewind on every path (not only when task_id is given) so the returned buffer
+    # always starts at offset 0, whatever position the download helper left it at.
+    data.seek(0)
+    return Types.File3D(source=data, file_format=file_format)
@@ -49,7 +49,7 @@ async def upload_images_to_comfyapi(
    mime_type: str | None = None,
    wait_label: str | None = "Uploading",
    show_batch_index: bool = True,
-    total_pixels: int = 2048 * 2048,
+    total_pixels: int | None = 2048 * 2048,
 ) -> list[str]:
    """
    Uploads images to ComfyUI API and returns download URLs.
@@ -88,6 +88,28 @@ async def upload_images_to_comfyapi(
    return download_urls


+async def upload_image_to_comfyapi(
+    cls: type[IO.ComfyNode],
+    image: torch.Tensor,
+    *,
+    mime_type: str | None = None,
+    wait_label: str | None = "Uploading",
+    total_pixels: int | None = 2048 * 2048,
+) -> str:
+    """Upload one image via ``upload_images_to_comfyapi`` and return the first download URL.
+
+    The call is made with ``max_images=1`` and ``show_batch_index=False``; the remaining
+    keyword arguments are passed through unchanged.
+    """
+    download_urls = await upload_images_to_comfyapi(
+        cls,
+        image,
+        max_images=1,
+        mime_type=mime_type,
+        wait_label=wait_label,
+        show_batch_index=False,
+        total_pixels=total_pixels,
+    )
+    return download_urls[0]
+
+
 async def upload_audio_to_comfyapi(
    cls: type[IO.ComfyNode],
    audio: Input.Audio,
@@ -171,9 +171,10 @@ def get_outputs_summary(outputs: dict) -> tuple[int, Optional[dict]]:
                continue

            for item in items:
+                count += 1
+
                if not isinstance(item, dict):
                    continue
-                count += 1

                if preview_output is None and is_previewable(media_type, item):
                    enriched = {
@@ -28,12 +28,39 @@ class TextEncodeAceStepAudio(io.ComfyNode):
        conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
        return io.NodeOutput(conditioning)

+class TextEncodeAceStepAudio15(io.ComfyNode):
+    """Conditioning node that tokenizes tags, lyrics and music metadata and encodes them."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the clip, text, seed and music-metadata inputs and the conditioning output."""
+        languages = ["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]
+        roots = ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]
+        # All major keys first, then all minor keys, each following the order of `roots`.
+        keyscales = [f"{root} {quality}" for quality in ["major", "minor"] for root in roots]
+        return io.Schema(
+            node_id="TextEncodeAceStepAudio1.5",
+            category="conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("tags", multiline=True, dynamic_prompts=True),
+                io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
+                io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
+                io.Int.Input("bpm", default=120, min=10, max=300),
+                io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
+                io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
+                io.Combo.Input("language", options=languages),
+                io.Combo.Input("keyscale", options=keyscales),
+            ],
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale) -> io.NodeOutput:
+        """Tokenize ``tags`` with the extra metadata and return the scheduled encoding.
+
+        ``timesignature`` arrives as a string combo value and is converted to ``int``
+        before being handed to ``clip.tokenize``.
+        """
+        tokenize_options = {
+            "lyrics": lyrics,
+            "bpm": bpm,
+            "duration": duration,
+            "timesignature": int(timesignature),
+            "language": language,
+            "keyscale": keyscale,
+            "seed": seed,
+        }
+        tokens = clip.tokenize(tags, **tokenize_options)
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
+

 class EmptyAceStepLatentAudio(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="EmptyAceStepLatentAudio",
+            display_name="Empty Ace Step 1.0 Latent Audio",
            category="latent/audio",
            inputs=[
                io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
@@ -51,12 +78,60 @@ class EmptyAceStepLatentAudio(io.ComfyNode):
        return io.NodeOutput({"samples": latent, "type": "audio"})


+class EmptyAceStep15LatentAudio(io.ComfyNode):
+    """Node that creates an all-zero audio latent of a requested duration and batch size."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the ``seconds`` and ``batch_size`` inputs and the latent output."""
+        seconds_input = io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01)
+        batch_size_input = io.Int.Input(
+            "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
+        )
+        return io.Schema(
+            node_id="EmptyAceStep1.5LatentAudio",
+            display_name="Empty Ace Step 1.5 Latent Audio",
+            category="latent/audio",
+            inputs=[seconds_input, batch_size_input],
+            outputs=[io.Latent.Output()],
+        )
+
+    @classmethod
+    def execute(cls, seconds, batch_size) -> io.NodeOutput:
+        """Return a zero tensor of shape ``[batch_size, 64, length]`` tagged as an audio latent.
+
+        ``length`` is ``seconds * 48000 / 1920`` rounded to the nearest integer.
+        """
+        length = round(seconds * 48000 / 1920)
+        shape = [batch_size, 64, length]
+        device = comfy.model_management.intermediate_device()
+        return io.NodeOutput({"samples": torch.zeros(shape, device=device), "type": "audio"})
+
+class ReferenceTimbreAudio(io.ComfyNode):
+    """Node that attaches an optional reference-audio latent to a conditioning."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the conditioning input, the optional latent input and the conditioning output."""
+        node_inputs = [
+            io.Conditioning.Input("conditioning"),
+            io.Latent.Input("latent", optional=True),
+        ]
+        return io.Schema(
+            node_id="ReferenceTimbreAudio",
+            category="advanced/conditioning/audio",
+            is_experimental=True,
+            description="This node sets the reference audio for timbre (for ace step 1.5)",
+            inputs=node_inputs,
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, conditioning, latent=None) -> io.NodeOutput:
+        """Return the conditioning, with the latent's samples appended when a latent is given.
+
+        Without a latent the conditioning is passed through unchanged.
+        """
+        if latent is None:
+            return io.NodeOutput(conditioning)
+        timbre_values = {"reference_audio_timbre_latents": [latent["samples"]]}
+        updated = node_helpers.conditioning_set_values(conditioning, timbre_values, append=True)
+        return io.NodeOutput(updated)
+
 class AceExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            TextEncodeAceStepAudio,
            EmptyAceStepLatentAudio,
+            TextEncodeAceStepAudio15,
+            EmptyAceStep15LatentAudio,
+            ReferenceTimbreAudio,
        ]

 async def comfy_entrypoint() -> AceExtension:
@@ -28,6 +28,7 @@ class AlignYourStepsScheduler(io.ComfyNode):
    def define_schema(cls) -> io.Schema:
        return io.Schema(
            node_id="AlignYourStepsScheduler",
+            search_aliases=["AYS scheduler"],
            category="sampling/custom_sampling/schedulers",
            inputs=[
                io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]),
--- a/Show More
+++ b/Show More