diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index 39c4f43a85..02a4b3fb21 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -795,6 +795,7 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数)
 - to_ollama: 产生ollama所需的Modelfile文件。默认为False。
 - 🔥to_mcore: HF格式权重转成Megatron格式。默认为False。
 - to_hf: Megatron格式权重转成HF格式。默认为False。
+- export_language_model_only: 只导出多模态模型中的语言模型部分,并写出文本模型配置。适用于只接受CausalLM格式权重名的文本推理后端。默认为False。
 - mcore_model: mcore格式模型路径。默认为None。
 - mcore_adapter: mcore格式模型的adapter路径,默认为None。
 - thread_count: `--to_mcore true`时的模型切片数。默认为None,根据模型大小自动设置,使得最大分片小于10GB。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index aa12e6f235..ff2218ad31 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -816,6 +816,7 @@ Export Arguments include the [basic arguments](#base-arguments) and [merge argum
 - to_ollama: Generate the Modelfile required by Ollama. Default is False.
 - 🔥to_mcore: Convert weights from HF format to Megatron format. Default is False.
 - to_hf: Convert weights from Megatron format to HF format. Default is False.
+- export_language_model_only: Export only the language model part of a multimodal model and write a text model config. This is intended for text-only inference backends that expect CausalLM-style weight names. Default is False.
 - mcore_model: Path to the mcore format model. Default is None.
 - mcore_adapter: The adapter path for mcore format models, default is None.
 - thread_count: The number of model slices when `--to_mcore true` is set. Defaults to None, and is automatically configured based on the model size, ensuring that the largest slice is less than 10GB.
diff --git a/swift/arguments/export_args.py b/swift/arguments/export_args.py
index cf08d8c72a..edda12fa04 100644
--- a/swift/arguments/export_args.py
+++ b/swift/arguments/export_args.py
@@ -30,6 +30,8 @@ class ExportArguments(MergeArguments, BaseArguments):
         to_ollama (bool): Whether to generate the `Modelfile` required by Ollama. Defaults to False.
         to_mcore (bool): Whether to convert Hugging Face format weights to Megatron-Core format. Defaults to False.
         to_hf (bool): Whether to convert Megatron-Core format weights to Hugging Face format. Defaults to False.
+        export_language_model_only (bool): Whether to export only the language model part of a multimodal model.
+            This is useful when a text-only inference backend expects CausalLM-style weight names. Defaults to False.
         mcore_model (Optional[str]): The path to the Megatron-Core format model. Defaults to None.
         mcore_adapter (Optional[str]): A list of adapter paths for the Megatron-Core format model. Defaults to [].
         thread_count (Optional[int]): The number of model shards when `to_mcore` is True. Defaults to None, which
@@ -66,6 +68,7 @@ class ExportArguments(MergeArguments, BaseArguments):
     # megatron
     to_mcore: bool = False
     to_hf: bool = False
+    export_language_model_only: bool = False
     mcore_model: Optional[str] = None
     mcore_adapter: Optional[str] = None
     thread_count: Optional[int] = None
@@ -101,6 +104,10 @@ def _init_output_dir(self):
                 suffix = 'ollama'
             elif self.merge_lora:
                 suffix = 'merged'
+                if self.export_language_model_only:
+                    suffix += '-language-model'
+            elif self.export_language_model_only:
+                suffix = 'language-model'
             elif self.to_mcore:
                 suffix = 'mcore'
             elif self.to_hf:
@@ -126,6 +133,11 @@ def __post_init__(self):
             raise ValueError('Please specify `--quant_bits`.')
         if self.quant_method in {'gptq', 'awq'} and self.torch_dtype is None:
             self.torch_dtype = torch.float16
+        if self.export_language_model_only and (
+                self.quant_method or self.to_ollama or self.to_mcore or self.to_hf or self.to_cached_dataset
+                or self.to_peft_format):
+            raise ValueError(
+                '`export_language_model_only` only supports regular checkpoint export or merge-lora export.')
         if self.to_mcore or self.to_hf:
             if self.merge_lora:
                 self.merge_lora = False
diff --git a/swift/model/utils.py b/swift/model/utils.py
index 7eaa9f5535..309d99b4df 100644
--- a/swift/model/utils.py
+++ b/swift/model/utils.py
@@ -1,9 +1,11 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+import copy
 import os
 import shutil
 import torch
 import torch.nn.functional as F
 from accelerate.utils import find_device
+from collections import OrderedDict
 from functools import wraps
 from packaging import version
 from peft import PeftModel
@@ -13,7 +15,7 @@
 from transformers.utils import (is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_mps_available,
                                 is_torch_npu_available, strtobool)
 from types import MethodType
-from typing import List, Optional, TypeVar, Union
+from typing import Dict, List, Optional, TypeVar, Union
 
 from swift.utils import (HfConfigFactory, Processor, deep_getattr, get_dist_setting, get_env_args, get_logger, is_mp,
                          to_device)
@@ -286,6 +288,131 @@ def forward(self, x):
 _patch_conv3d()
 
 
+def _get_language_model_prefixes(model: PreTrainedModel) -> List[str]:
+    """Return the `language_model` prefixes declared on `model.model_meta.model_arch` (a lone string is wrapped)."""
+    model_arch = getattr(getattr(model, 'model_meta', None), 'model_arch', None)
+    prefixes = getattr(model_arch, 'language_model', None) or []
+    if isinstance(prefixes, str):
+        prefixes = [prefixes]
+    return prefixes
+
+
+def _get_language_model_target_prefix(source_prefix: str) -> str:
+    """Map a source prefix to its text-only prefix: `language_model` -> `model`, `X.language_model` -> `X`."""
+    if source_prefix == 'language_model':
+        return 'model'
+    suffix = '.language_model'
+    if source_prefix.endswith(suffix):
+        return source_prefix[:-len(suffix)]
+    raise ValueError(
+        f'Cannot export language-model-only checkpoint for language_model prefix `{source_prefix}`. '
+        'Only `language_model` and `*.language_model` prefixes are currently supported.')
+
+
+def get_language_model_state_dict(model: PreTrainedModel) -> Dict[str, torch.Tensor]:
+    """Build the state dict of the language model part only.
+
+    Keys under the first declared prefix are renamed to the text-only prefix (e.g. `model.language_model.x` ->
+    `model.x`); keys under the remaining prefixes are kept unchanged; every other key is dropped.
+    Raises `ValueError` if no prefix is declared, the first prefix is unsupported, no key matches, or two keys
+    map to the same name.
+    """
+    prefixes = _get_language_model_prefixes(model)
+    if not prefixes:
+        raise ValueError('`export_language_model_only` requires a multimodal model with language_model prefixes.')
+
+    source_prefix = prefixes[0].rstrip('.')
+    target_prefix = _get_language_model_target_prefix(source_prefix)
+    state_dict = model.state_dict()
+    output_state_dict = OrderedDict()
+    for key, value in state_dict.items():
+        new_key = None
+        if key.startswith(f'{source_prefix}.'):
+            new_key = f'{target_prefix}.{key[len(source_prefix) + 1:]}'
+        elif key == source_prefix:
+            new_key = target_prefix
+        else:
+            for prefix in prefixes[1:]:
+                prefix = prefix.rstrip('.')
+                if key == prefix or key.startswith(f'{prefix}.'):
+                    new_key = key
+                    break
+        if new_key is None:
+            continue
+        if new_key in output_state_dict:
+            raise ValueError(f'Duplicate key `{new_key}` while exporting language-model-only checkpoint.')
+        output_state_dict[new_key] = value
+
+    if not output_state_dict:
+        raise ValueError(f'No language model weights found with prefixes: {prefixes}.')
+    return output_state_dict
+
+
+def _infer_language_model_architectures(model: PreTrainedModel) -> Optional[List[str]]:
+    """Derive a `*ForCausalLM` architecture name from the config architectures or the class name, else None."""
+    candidates = []
+    config = getattr(model, 'config', None)
+    if config is not None:
+        architectures = getattr(config, 'architectures', None) or []
+        candidates.extend(architectures)
+    candidates.append(model.__class__.__name__)
+
+    for arch in candidates:
+        if arch.endswith('ForCausalLM'):
+            return [arch]
+        if arch.endswith('ForConditionalGeneration'):
+            return [arch[:-len('ForConditionalGeneration')] + 'ForCausalLM']
+    return None
+
+
+def get_language_model_config(model: PreTrainedModel) -> PretrainedConfig:
+    """Return a deep copy of the text config, with `architectures` set to the inferred CausalLM name if any."""
+    text_config = HfConfigFactory.get_text_config(model.config)
+    if text_config is model.config:
+        raise ValueError('`export_language_model_only` requires a multimodal config with a text config.')
+
+    text_config = copy.deepcopy(text_config)
+    architectures = _infer_language_model_architectures(model)
+    if architectures is not None:
+        text_config.architectures = architectures
+    return text_config
+
+
+def save_language_model_checkpoint(model: PreTrainedModel,
+                                   output_dir: str,
+                                   *,
+                                   safe_serialization: bool = True,
+                                   max_shard_size: Union[int, str] = '5GB') -> None:
+    """Write the text config, the generation config (if any) and the language-model weights to `output_dir`."""
+    try:
+        from huggingface_hub import save_torch_state_dict
+    except ImportError as e:
+        raise ImportError('`export_language_model_only` requires `huggingface_hub.save_torch_state_dict`.') from e
+
+    os.makedirs(output_dir, exist_ok=True)
+    state_dict = get_language_model_state_dict(model)
+    text_config = get_language_model_config(model)
+    text_config.save_pretrained(output_dir)
+    generation_config = getattr(model, 'generation_config', None)
+    if generation_config is not None:
+        generation_config.save_pretrained(output_dir)
+    save_torch_state_dict(
+        state_dict,
+        output_dir,
+        max_shard_size=max_shard_size,
+        safe_serialization=safe_serialization,
+        metadata={'format': 'pt'} if safe_serialization else None)
+
+
+def save_processor_checkpoint(processor: Processor, output_dir: str, *, language_model_only: bool = False) -> None:
+    """Save the processor, or only its tokenizer when `language_model_only` is True."""
+    if language_model_only:
+        # A bare tokenizer has no `.tokenizer` attribute; save it as-is instead of silently writing nothing.
+        tokenizer = getattr(processor, 'tokenizer', None)
+        if tokenizer is not None:
+            processor = tokenizer
+    processor.save_pretrained(output_dir)
+
+
 def save_checkpoint(model: Optional[PreTrainedModel],
                     processor: Processor,
                     output_dir: str,
@@ -293,9 +420,13 @@ def save_checkpoint(model: Optional[PreTrainedModel],
                     safe_serialization: bool = True,
                     max_shard_size: Union[int, str] = '5GB',
                     model_dirs: List[str] = None,
-                    additional_saved_files: Optional[List[str]] = None) -> None:
+                    additional_saved_files: Optional[List[str]] = None,
+                    language_model_only: bool = False) -> None:
     if model is not None:
-        if model.__class__.__name__ != 'SentenceTransformer':
+        if language_model_only:
+            save_language_model_checkpoint(
+                model, output_dir, safe_serialization=safe_serialization, max_shard_size=max_shard_size)
+        elif model.__class__.__name__ != 'SentenceTransformer':
             model.save_pretrained(output_dir, safe_serialization=safe_serialization, max_shard_size=max_shard_size)
         else:
             model.save_pretrained(output_dir, safe_serialization=safe_serialization)
@@ -303,7 +434,7 @@ def save_checkpoint(model: Optional[PreTrainedModel],
             from swift.utils import copy_files_by_pattern
             copy_files_by_pattern(model.model_dir, output_dir, '*.py')
             copy_files_by_pattern(model.model_dir, output_dir, '*.json')
-    processor.save_pretrained(output_dir)
+    save_processor_checkpoint(processor, output_dir, language_model_only=language_model_only)
 
     if model_dirs is None:
         model_dirs = []
@@ -311,7 +442,10 @@ def save_checkpoint(model: Optional[PreTrainedModel],
         model_dirs = model_dirs.copy()
     if model and model.model_dir and model.model_dir not in model_dirs:
         model_dirs.append(model.model_dir)
-    for src_file in (additional_saved_files or []) + ['preprocessor_config.json', 'args.json']:
+    src_files = (additional_saved_files or []) + ['args.json']
+    if not language_model_only:
+        src_files.append('preprocessor_config.json')
+    for src_file in src_files:
         tgt_path = os.path.join(output_dir, src_file)
         if os.path.exists(tgt_path) and src_file == 'args.json':
             continue
diff --git a/swift/pipelines/export/export.py b/swift/pipelines/export/export.py
index 2f202bbc41..fc8f04cd89 100644
--- a/swift/pipelines/export/export.py
+++ b/swift/pipelines/export/export.py
@@ -6,7 +6,7 @@
 from swift.tuners import swift_to_peft_format
 from swift.utils import get_logger
 from .cached_dataset import export_cached_dataset
-from .merge_lora import merge_lora
+from .merge_lora import export_language_model, merge_lora
 from .ollama import export_to_ollama
 from .quant import quantize_model
 
@@ -27,6 +27,8 @@ def run(self):
                 args.output_dir = None
             merge_lora(args)
             args.output_dir = output_dir  # recover
+        if args.export_language_model_only and not args.merge_lora:
+            export_language_model(args)
         if args.quant_method:
             quantize_model(args)
         elif args.to_ollama:
diff --git a/swift/pipelines/export/merge_lora.py b/swift/pipelines/export/merge_lora.py
index 49368aa2ee..dc56514868 100644
--- a/swift/pipelines/export/merge_lora.py
+++ b/swift/pipelines/export/merge_lora.py
@@ -52,10 +52,45 @@ def merge_lora(args: ExportArguments, device_map=None, replace_if_exists=False)
             safe_serialization=args.safe_serialization,
             model_dirs=args.adapters,
             max_shard_size=args.max_shard_size,
-            additional_saved_files=model.model_meta.additional_saved_files)
+            additional_saved_files=model.model_meta.additional_saved_files,
+            language_model_only=args.export_language_model_only)
         logger.info(f'Successfully merged LoRA and saved in `{output_dir}`.')
         args.device_map = origin_device_map
 
     args.model = output_dir
     args.model_dir = output_dir
     args.adapters = []
+
+
+def export_language_model(args: ExportArguments, device_map=None, replace_if_exists=False) -> None:
+    """Export only the language model part of the model described by `args` to `args.output_dir`.
+
+    Saving is skipped when the output directory already exists unless `replace_if_exists` is True. Afterwards
+    `args.model` and `args.model_dir` point at the output directory and `args.adapters` is cleared.
+    """
+    if replace_if_exists:
+        logger.info(f'replace_if_exists: {replace_if_exists}')
+    output_dir = args.output_dir
+    if os.path.exists(output_dir) and not replace_if_exists:
+        logger.info(f'The language model export already exists in {output_dir}, skipping the saving process.')
+    else:
+        origin_device_map = args.device_map
+        args.device_map = device_map or args.device_map
+        logger.info(f'export_device_map: {device_map}')
+        model, template = prepare_model_template(args)
+        logger.info('Saving language model weights...')
+        save_checkpoint(
+            model,
+            template.processor,
+            output_dir,
+            safe_serialization=args.safe_serialization,
+            model_dirs=[args.model_dir],
+            max_shard_size=args.max_shard_size,
+            additional_saved_files=model.model_meta.additional_saved_files,
+            language_model_only=True)
+        logger.info(f'Successfully exported language model weights in `{output_dir}`.')
+        args.device_map = origin_device_map
+
+    args.model = output_dir
+    args.model_dir = output_dir
+    args.adapters = []
diff --git a/tests/export/test_language_model_export.py b/tests/export/test_language_model_export.py
new file mode 100644
index 0000000000..38edf5d550
--- /dev/null
+++ b/tests/export/test_language_model_export.py
@@ -0,0 +1,131 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import json
+import os
+import tempfile
+from types import SimpleNamespace
+
+import torch
+from safetensors.torch import load_file
+from transformers import PretrainedConfig
+
+from swift.model import save_checkpoint
+from swift.model.utils import save_processor_checkpoint
+
+
+class TextConfig(PretrainedConfig):
+    model_type = 'qwen3_5_text'
+
+
+class DummyProcessor:
+
+    def __init__(self):
+        self.tokenizer = DummyTokenizer()
+
+    def save_pretrained(self, output_dir):
+        with open(os.path.join(output_dir, 'preprocessor_config.json'), 'w') as f:
+            json.dump({'processor': 'multimodal'}, f)
+
+
+class DummyTokenizer:
+
+    def save_pretrained(self, output_dir):
+        with open(os.path.join(output_dir, 'tokenizer_config.json'), 'w') as f:
+            json.dump({}, f)
+
+
+class DummyQwen35Model:
+
+    def __init__(self):
+        self.config = PretrainedConfig()
+        self.config.architectures = ['Qwen3_5ForConditionalGeneration']
+        self.config.text_config = TextConfig()
+        self.config.text_config.architectures = ['Qwen3_5ForConditionalGeneration']
+        self.model_meta = SimpleNamespace(
+            model_arch=SimpleNamespace(language_model=['model.language_model', 'lm_head']), additional_saved_files=[])
+        self.model_dir = None
+
+    def state_dict(self):
+        return {
+            'model.language_model.embed_tokens.weight': torch.ones(2, 2),
+            'model.language_model.layers.0.self_attn.q_proj.weight': torch.ones(2, 2) * 2,
+            'lm_head.weight': torch.ones(2, 2) * 3,
+            'model.visual.patch_embed.weight': torch.ones(2, 2) * 4,
+        }
+
+
+class DummyDefaultModel(DummyQwen35Model):
+
+    def save_pretrained(self, output_dir, safe_serialization=True, max_shard_size='5GB'):
+        with open(os.path.join(output_dir, 'model_saved.json'), 'w') as f:
+            json.dump({
+                'safe_serialization': safe_serialization,
+                'max_shard_size': max_shard_size,
+            }, f)
+
+
+def test_save_checkpoint_default_multimodal_export_unchanged():
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        output_dir = os.path.join(tmp_dir, 'output')
+        model_dir = os.path.join(tmp_dir, 'model')
+        os.makedirs(output_dir)
+        os.makedirs(model_dir)
+        with open(os.path.join(model_dir, 'preprocessor_config.json'), 'w') as f:
+            json.dump({'source': 'vision'}, f)
+
+        save_checkpoint(
+            DummyDefaultModel(),
+            DummyProcessor(),
+            output_dir,
+            safe_serialization=False,
+            max_shard_size='1GB',
+            model_dirs=[model_dir])
+
+        with open(os.path.join(output_dir, 'model_saved.json')) as f:
+            model_saved = json.load(f)
+        assert model_saved == {'safe_serialization': False, 'max_shard_size': '1GB'}
+
+        with open(os.path.join(output_dir, 'preprocessor_config.json')) as f:
+            processor_config = json.load(f)
+        assert processor_config == {'source': 'vision'}
+        assert not os.path.exists(os.path.join(output_dir, 'tokenizer_config.json'))
+
+
+def test_save_checkpoint_export_language_model_only():
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        output_dir = os.path.join(tmp_dir, 'output')
+        model_dir = os.path.join(tmp_dir, 'model')
+        os.makedirs(model_dir)
+        with open(os.path.join(model_dir, 'preprocessor_config.json'), 'w') as f:
+            json.dump({'source': 'vision'}, f)
+        with open(os.path.join(model_dir, 'args.json'), 'w') as f:
+            json.dump({'source': 'args'}, f)
+
+        save_checkpoint(
+            DummyQwen35Model(),
+            DummyProcessor(),
+            output_dir,
+            safe_serialization=True,
+            max_shard_size='10GB',
+            model_dirs=[model_dir],
+            language_model_only=True)
+
+        state_dict = load_file(os.path.join(output_dir, 'model.safetensors'))
+        assert set(state_dict) == {
+            'model.embed_tokens.weight',
+            'model.layers.0.self_attn.q_proj.weight',
+            'lm_head.weight',
+        }
+
+        with open(os.path.join(output_dir, 'config.json')) as f:
+            config = json.load(f)
+        assert config['model_type'] == 'qwen3_5_text'
+        assert config['architectures'] == ['Qwen3_5ForCausalLM']
+        assert os.path.exists(os.path.join(output_dir, 'tokenizer_config.json'))
+        assert os.path.exists(os.path.join(output_dir, 'args.json'))
+        assert not os.path.exists(os.path.join(output_dir, 'preprocessor_config.json'))
+
+
+def test_save_processor_checkpoint_falls_back_to_processor_without_tokenizer():
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        save_processor_checkpoint(DummyTokenizer(), tmp_dir, language_model_only=True)
+        assert os.path.exists(os.path.join(tmp_dir, 'tokenizer_config.json'))