From a286ab9df31eccb813fca0e3ccb182f608f25626 Mon Sep 17 00:00:00 2001
From: itazap
Date: Tue, 27 Jan 2026 13:35:37 +0100
Subject: [PATCH 1/4] v5 tok update

---
 src/diffusers/loaders/textual_inversion.py | 34 +++++++++++-----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index 63fc97ed431f..2faa10bf9f25 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -13,6 +13,9 @@
 # limitations under the License.
 from typing import Dict, List, Optional, Union
 
+import json
+from tokenizers import Tokenizer as TokenizerFast
+
 import safetensors
 import torch
 from huggingface_hub.utils import validate_hf_hub_args
@@ -547,23 +550,20 @@ def unload_textual_inversion(
                 else:
                     last_special_token_id = added_token_id
 
-        # Delete from tokenizer
-        for token_id, token_to_remove in zip(token_ids, tokens):
-            del tokenizer._added_tokens_decoder[token_id]
-            del tokenizer._added_tokens_encoder[token_to_remove]
-
-        # Make all token ids sequential in tokenizer
-        key_id = 1
-        for token_id in tokenizer.added_tokens_decoder:
-            if token_id > last_special_token_id and token_id > last_special_token_id + key_id:
-                token = tokenizer._added_tokens_decoder[token_id]
-                tokenizer._added_tokens_decoder[last_special_token_id + key_id] = token
-                del tokenizer._added_tokens_decoder[token_id]
-                tokenizer._added_tokens_encoder[token.content] = last_special_token_id + key_id
-                key_id += 1
-        tokenizer._update_trie()
-        # set correct total vocab size after removing tokens
-        tokenizer._update_total_vocab_size()
+        # Fast tokenizers: serialize, filter tokens, reload
+        tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
+
+        new_id = last_special_token_id + 1
+        filtered = []
+        for tok in tokenizer_json.get("added_tokens", []):
+            if tok.get("content") in set(tokens):
+                continue
+            if not tok.get("special", False):
+                tok["id"] = new_id
+                new_id += 1
+            filtered.append(tok)
+        tokenizer_json["added_tokens"] = filtered
+        tokenizer._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
 
         # Delete from text encoder
         text_embedding_dim = text_encoder.get_input_embeddings().embedding_dim

From 14284cca2a9826719fe857a2cda9c82b6770b854 Mon Sep 17 00:00:00 2001
From: itazap
Date: Tue, 27 Jan 2026 14:07:00 +0100
Subject: [PATCH 2/4] ruff

---
 src/diffusers/loaders/textual_inversion.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index 2faa10bf9f25..43f03807470f 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -11,14 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional, Union
-
 import json
-from tokenizers import Tokenizer as TokenizerFast
+from typing import Dict, List, Optional, Union
 
 import safetensors
 import torch
 from huggingface_hub.utils import validate_hf_hub_args
+from tokenizers import Tokenizer as TokenizerFast
 from torch import nn
 
 from ..models.modeling_utils import load_state_dict

From 0c01cc12f2d22b3a6f8e83a1e7f96808c142899b Mon Sep 17 00:00:00 2001
From: itazap
Date: Wed, 28 Jan 2026 12:01:07 +0100
Subject: [PATCH 3/4] keep pre v5 slow code path

---
 src/diffusers/loaders/textual_inversion.py | 47 +++++++++++++++-------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index 43f03807470f..10d8218595ca 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -549,20 +549,39 @@ def unload_textual_inversion(
                 else:
                     last_special_token_id = added_token_id
 
-        # Fast tokenizers: serialize, filter tokens, reload
-        tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
-
-        new_id = last_special_token_id + 1
-        filtered = []
-        for tok in tokenizer_json.get("added_tokens", []):
-            if tok.get("content") in set(tokens):
-                continue
-            if not tok.get("special", False):
-                tok["id"] = new_id
-                new_id += 1
-            filtered.append(tok)
-        tokenizer_json["added_tokens"] = filtered
-        tokenizer._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
+        # Fast tokenizers (v5+)
+        if hasattr(tokenizer, "_tokenizer"):
+            # Fast tokenizers: serialize, filter tokens, reload
+            tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
+            new_id = last_special_token_id + 1
+            filtered = []
+            for tok in tokenizer_json.get("added_tokens", []):
+                if tok.get("content") in set(tokens):
+                    continue
+                if not tok.get("special", False):
+                    tok["id"] = new_id
+                    new_id += 1
+                filtered.append(tok)
+            tokenizer_json["added_tokens"] = filtered
+            tokenizer._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
+        else:
+            # Slow tokenizers 
+            for token_id, token_to_remove in zip(token_ids, tokens):
+                del tokenizer._added_tokens_decoder[token_id]
+                del tokenizer._added_tokens_encoder[token_to_remove]
+
+            key_id = 1
+            for token_id in list(tokenizer.added_tokens_decoder.keys()):
+                if token_id > last_special_token_id and token_id > last_special_token_id + key_id:
+                    token = tokenizer._added_tokens_decoder[token_id]
+                    tokenizer._added_tokens_decoder[last_special_token_id + key_id] = token
+                    del tokenizer._added_tokens_decoder[token_id]
+                    tokenizer._added_tokens_encoder[token.content] = last_special_token_id + key_id
+                    key_id += 1
+            if hasattr(tokenizer, "_update_trie"):
+                tokenizer._update_trie()
+            if hasattr(tokenizer, "_update_total_vocab_size"):
+                tokenizer._update_total_vocab_size()
 
         # Delete from text encoder
         text_embedding_dim = text_encoder.get_input_embeddings().embedding_dim

From cbb2af84fda8c401872169bea7fdb661536df4c0 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 28 Jan 2026 11:10:38 +0000
Subject: [PATCH 4/4] Apply style fixes

---
 src/diffusers/loaders/textual_inversion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index 10d8218595ca..e4346700a7b8 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -565,7 +565,7 @@ def unload_textual_inversion(
             tokenizer_json["added_tokens"] = filtered
             tokenizer._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
         else:
-            # Slow tokenizers 
+            # Slow tokenizers
             for token_id, token_to_remove in zip(token_ids, tokens):
                 del tokenizer._added_tokens_decoder[token_id]
                 del tokenizer._added_tokens_encoder[token_to_remove]
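
Note on exercising the change: below is a minimal sketch, not part of the patches, of how the updated unload_textual_inversion path could be driven end to end. The checkpoint, concept repository, and "<cat-toy>" placeholder token are illustrative assumptions; any Stable Diffusion 1.x pipeline with a textual-inversion embedding should exercise either the fast-tokenizer (v5+) branch or the slow-tokenizer fallback in the same way.

from diffusers import AutoPipelineForText2Image

# Placeholder model and concept; downloading both from the Hub is assumed.
pipeline = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
pipeline.load_textual_inversion("sd-concepts-library/cat-toy")

num_added_before = len(pipeline.tokenizer.added_tokens_decoder)
pipeline.unload_textual_inversion("<cat-toy>")
num_added_after = len(pipeline.tokenizer.added_tokens_decoder)

# The inversion token should be gone from the tokenizer, and the text encoder
# embedding matrix should have been shrunk to match.
assert num_added_after < num_added_before
print(len(pipeline.tokenizer), pipeline.text_encoder.get_input_embeddings().weight.shape[0])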