chore: add .gitignore

This commit is contained in:
dom
2026-03-05 00:37:36 +01:00
parent 06100df236
commit e26be72f9c
449 changed files with 504051 additions and 57 deletions

View File

@@ -0,0 +1,89 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.aqlm import (torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add the scaled LoRA update of one adapter onto the base-layer output.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` using a single
    ``addmm`` call and, when ``lora_B.bias`` is not ``None``, additionally
    adds ``scaling * bias``.

    Args:
        result: Output of the base layer. Its dtype and shape determine the
            dtype and shape of the returned tensor. It may be non-contiguous.
        lora_A: Module exposing ``.weight`` (LoRA down-projection).
        lora_B: Module exposing ``.weight`` and ``.bias`` (LoRA up-projection;
            ``bias`` may be ``None``).
        dropout: Callable applied to ``x`` before the down-projection.
        x: Layer input.
        scaling: Scalar multiplier, passed as ``alpha`` to ``addmm`` / ``add``.

    Returns:
        A tensor with the shape and dtype of ``result``.
    """
    # Work in the dtype of the base-layer output: x may have been cast to
    # another dtype (e.g. float32) by _cast_input_dtype when autocast is disabled.
    target_dtype = result.dtype
    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()

    # output = result + scaling * xA @ lora_B.weight.t()
    shape = result.shape
    # reshape (rather than view) flattens the leading dimensions even when the
    # tensor is non-contiguous, where view would raise a RuntimeError.
    output = torch_addmm(
        result.reshape(-1, shape[-1]),
        xA.reshape(-1, xA.shape[-1]),
        lora_B.weight.to(target_dtype).t(),
        alpha = scaling,
        beta = 1,
    ).view(shape)

    bias = lora_B.bias
    if bias is not None:
        # The bias is scaled as well: output + scaling * bias.
        output = torch_add(
            output,
            bias.to(target_dtype),
            alpha = scaling,
        )
    return output
pass
def unsloth_forward(self, x: torch.Tensor):
    """Run the base layer and add the contribution of every active LoRA adapter.

    Args:
        x: Input tensor passed to ``self.base_layer`` and to the adapters.

    Returns:
        The base-layer output when ``self.disable_adapters`` is true; otherwise
        the base-layer output plus ``scaling * lora_B(lora_A(dropout(x)))`` for
        each name in ``self.active_adapters`` that has an entry in
        ``self.lora_A``. The tensor returned by the base layer is never
        modified in place.
    """
    # note: logic differs from default Linear because merging is not supported
    result = self.base_layer(x)

    if self.disable_adapters:
        return result

    for active_adapter in self.active_adapters:
        if active_adapter not in self.lora_A.keys():
            continue
        lora_A = self.lora_A[active_adapter]
        lora_B = self.lora_B[active_adapter]
        dropout = self.lora_dropout[active_adapter]
        scaling = self.scaling[active_adapter]

        # Without autocast, cast the input to the adapter weight dtype and
        # cast the adapter output back to the base-layer output dtype.
        requires_conversion = not torch.is_autocast_enabled()
        if requires_conversion:
            expected_dtype = result.dtype
            x = self._cast_input_dtype(x, lora_A.weight.dtype)

        output = lora_B(lora_A(dropout(x)))
        if requires_conversion:
            output = output.to(expected_dtype)
        output = output * scaling
        # Out-of-place add: leaves the base layer's output tensor untouched.
        result = result + output
    return result

View File

@@ -0,0 +1,88 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.awq import (torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add the scaled LoRA update of one adapter onto the base-layer output.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` using a single
    ``addmm`` call and, when ``lora_B.bias`` is not ``None``, additionally
    adds ``scaling * bias``.

    Args:
        result: Output of the base layer. Its dtype and shape determine the
            dtype and shape of the returned tensor. It may be non-contiguous.
        lora_A: Module exposing ``.weight`` (LoRA down-projection).
        lora_B: Module exposing ``.weight`` and ``.bias`` (LoRA up-projection;
            ``bias`` may be ``None``).
        dropout: Callable applied to ``x`` before the down-projection.
        x: Layer input.
        scaling: Scalar multiplier, passed as ``alpha`` to ``addmm`` / ``add``.

    Returns:
        A tensor with the shape and dtype of ``result``.
    """
    # Work in the dtype of the base-layer output: x may have been cast to
    # another dtype (e.g. float32) by _cast_input_dtype when autocast is disabled.
    target_dtype = result.dtype
    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()

    # output = result + scaling * xA @ lora_B.weight.t()
    shape = result.shape
    # reshape (rather than view) flattens the leading dimensions even when the
    # tensor is non-contiguous, where view would raise a RuntimeError.
    output = torch_addmm(
        result.reshape(-1, shape[-1]),
        xA.reshape(-1, xA.shape[-1]),
        lora_B.weight.to(target_dtype).t(),
        alpha = scaling,
        beta = 1,
    ).view(shape)

    bias = lora_B.bias
    if bias is not None:
        # The bias is scaled as well: output + scaling * bias.
        output = torch_add(
            output,
            bias.to(target_dtype),
            alpha = scaling,
        )
    return output
pass
def unsloth_forward(self, x: torch.Tensor):
    """Evaluate the quantized linear module and add every active LoRA adapter.

    Args:
        x: Input tensor passed to ``self.quant_linear_module`` and the adapters.

    Returns:
        The quantized module's output when ``self.disable_adapters`` is true;
        otherwise that output plus ``scaling * lora_B(lora_A(dropout(x)))`` for
        each name in ``self.active_adapters`` that has an entry in
        ``self.lora_A``.
    """
    result = self.quant_linear_module(x)
    if self.disable_adapters:
        return result

    for adapter_name in self.active_adapters:
        if adapter_name not in self.lora_A.keys():
            continue

        down_proj = self.lora_A[adapter_name]
        up_proj = self.lora_B[adapter_name]
        drop = self.lora_dropout[adapter_name]
        scale = self.scaling[adapter_name]

        # With autocast off, dtypes are aligned by hand: the input goes to the
        # adapter weight dtype, the adapter output back to the result dtype.
        autocast_off = not torch.is_autocast_enabled()
        if autocast_off:
            base_dtype = result.dtype
            x = self._cast_input_dtype(x, down_proj.weight.dtype)

        delta = up_proj(down_proj(drop(x)))
        if autocast_off:
            delta = delta.to(base_dtype)

        result = result + delta * scale
    return result

View File

@@ -0,0 +1,117 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (nn)
def forward(self, input: Tensor) -> Tensor:
    """Apply batch normalization and return the result in ``input``'s dtype.

    In training mode with ``track_running_stats`` enabled and a non-``None``
    ``num_batches_tracked`` buffer, that buffer is incremented in place.

    Args:
        input: Tensor to normalize; validated by ``self._check_input_dim``.

    Returns:
        The output of ``F.batch_norm`` cast to ``input.dtype``.
    """
    self._check_input_dim(input)

    # exponential_average_factor is set to self.momentum
    # (when it is available) only so that it gets updated
    # in ONNX graph when this node is exported to ONNX.
    if self.momentum is None:
        exponential_average_factor = 0.0
    else:
        exponential_average_factor = self.momentum

    if self.training and self.track_running_stats:
        # TODO: if statement only here to tell the jit to skip emitting this when it is None
        if self.num_batches_tracked is not None:  # type: ignore[has-type]
            self.num_batches_tracked.add_(1)  # type: ignore[has-type]
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / float(self.num_batches_tracked)
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

    # Decide whether the mini-batch stats should be used for normalization rather than the buffers.
    # Mini-batch stats are used in training mode, and in eval mode when buffers are None.
    if self.training:
        bn_training = True
    else:
        bn_training = (self.running_mean is None) and (self.running_var is None)

    # Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
    # passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
    # used for normalization (i.e. in eval mode when buffers are not None).
    return F.batch_norm(
        input,
        # If buffers are not to be tracked, ensure that they won't be updated
        (
            self.running_mean
            if not self.training or self.track_running_stats
            else None
        ),
        self.running_var if not self.training or self.track_running_stats else None,
        self.weight,
        self.bias,
        bn_training,
        exponential_average_factor,
        self.eps,
    ).to(input.dtype)  # one cast is enough; repeating the same .to() is a no-op

View File

@@ -0,0 +1,117 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (nn)
def forward(self, input: Tensor) -> Tensor:
    """Apply batch normalization and return the result in ``input``'s dtype.

    In training mode with ``track_running_stats`` enabled and a non-``None``
    ``num_batches_tracked`` buffer, that buffer is incremented in place.

    Args:
        input: Tensor to normalize; validated by ``self._check_input_dim``.

    Returns:
        The output of ``F.batch_norm`` cast to ``input.dtype``.
    """
    self._check_input_dim(input)

    # exponential_average_factor is set to self.momentum
    # (when it is available) only so that it gets updated
    # in ONNX graph when this node is exported to ONNX.
    if self.momentum is None:
        exponential_average_factor = 0.0
    else:
        exponential_average_factor = self.momentum

    if self.training and self.track_running_stats:
        # TODO: if statement only here to tell the jit to skip emitting this when it is None
        if self.num_batches_tracked is not None:  # type: ignore[has-type]
            self.num_batches_tracked.add_(1)  # type: ignore[has-type]
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / float(self.num_batches_tracked)
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

    # Decide whether the mini-batch stats should be used for normalization rather than the buffers.
    # Mini-batch stats are used in training mode, and in eval mode when buffers are None.
    if self.training:
        bn_training = True
    else:
        bn_training = (self.running_mean is None) and (self.running_var is None)

    # Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
    # passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
    # used for normalization (i.e. in eval mode when buffers are not None).
    return F.batch_norm(
        input,
        # If buffers are not to be tracked, ensure that they won't be updated
        (
            self.running_mean
            if not self.training or self.track_running_stats
            else None
        ),
        self.running_var if not self.training or self.track_running_stats else None,
        self.weight,
        self.bias,
        bn_training,
        exponential_average_factor,
        self.eps,
    ).to(input.dtype)  # one cast is enough; repeating the same .to() is a no-op

View File

@@ -0,0 +1,117 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (nn)
def forward(self, input: Tensor) -> Tensor:
    """Apply batch normalization and return the result in ``input``'s dtype.

    In training mode with ``track_running_stats`` enabled and a non-``None``
    ``num_batches_tracked`` buffer, that buffer is incremented in place.

    Args:
        input: Tensor to normalize; validated by ``self._check_input_dim``.

    Returns:
        The output of ``F.batch_norm`` cast to ``input.dtype``.
    """
    self._check_input_dim(input)

    # exponential_average_factor is set to self.momentum
    # (when it is available) only so that it gets updated
    # in ONNX graph when this node is exported to ONNX.
    if self.momentum is None:
        exponential_average_factor = 0.0
    else:
        exponential_average_factor = self.momentum

    if self.training and self.track_running_stats:
        # TODO: if statement only here to tell the jit to skip emitting this when it is None
        if self.num_batches_tracked is not None:  # type: ignore[has-type]
            self.num_batches_tracked.add_(1)  # type: ignore[has-type]
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / float(self.num_batches_tracked)
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

    # Decide whether the mini-batch stats should be used for normalization rather than the buffers.
    # Mini-batch stats are used in training mode, and in eval mode when buffers are None.
    if self.training:
        bn_training = True
    else:
        bn_training = (self.running_mean is None) and (self.running_var is None)

    # Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
    # passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
    # used for normalization (i.e. in eval mode when buffers are not None).
    return F.batch_norm(
        input,
        # If buffers are not to be tracked, ensure that they won't be updated
        (
            self.running_mean
            if not self.training or self.track_running_stats
            else None
        ),
        self.running_var if not self.training or self.track_running_stats else None,
        self.weight,
        self.bias,
        bn_training,
        exponential_average_factor,
        self.eps,
    ).to(input.dtype)  # one cast is enough; repeating the same .to() is a no-op

View File

@@ -0,0 +1,70 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
def forward(self, input: Tensor) -> Tensor:
    """Run the module's convolution and return the result in ``input``'s dtype.

    Args:
        input: Tensor passed to ``self._conv_forward`` together with
            ``self.weight`` and ``self.bias``.

    Returns:
        The convolution output cast to ``input.dtype``.
    """
    conv_out = self._conv_forward(input, self.weight, self.bias)
    # A single cast suffices; repeating the same .to(dtype) is a no-op.
    return conv_out.to(input.dtype)

View File

@@ -0,0 +1,70 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
def forward(self, input: Tensor) -> Tensor:
    """Run the module's convolution and return the result in ``input``'s dtype.

    Args:
        input: Tensor passed to ``self._conv_forward`` together with
            ``self.weight`` and ``self.bias``.

    Returns:
        The convolution output cast to ``input.dtype``.
    """
    conv_out = self._conv_forward(input, self.weight, self.bias)
    # A single cast suffices; repeating the same .to(dtype) is a no-op.
    return conv_out.to(input.dtype)

View File

@@ -0,0 +1,70 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
def forward(self, input: Tensor) -> Tensor:
    """Run the module's convolution and return the result in ``input``'s dtype.

    Args:
        input: Tensor passed to ``self._conv_forward`` together with
            ``self.weight`` and ``self.bias``.

    Returns:
        The convolution output cast to ``input.dtype``.
    """
    conv_out = self._conv_forward(input, self.weight, self.bias)
    # A single cast suffices; repeating the same .to(dtype) is a no-op.
    return conv_out.to(input.dtype)

View File

@@ -0,0 +1,97 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
# Probe for torch.compile "stance" support. Both names fall back to None when
# any step of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed by this best-effort probe.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (nn)
def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor:
if self.padding_mode != "zeros":
raise ValueError(
"Only `zeros` padding mode is supported for ConvTranspose1d"
)
assert isinstance(self.padding, tuple)
# One cannot replace List by Tuple or Sequence in "_output_padding" because
# TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
num_spatial_dims = 1
output_padding = self._output_padding(
input,
output_size,
self.stride, # type: ignore[arg-type]
self.padding, # type: ignore[arg-type]
self.kernel_size, # type: ignore[arg-type]
num_spatial_dims,
self.dilation, # type: ignore[arg-type]
)
return F.conv_transpose1d(
input,
self.weight,
self.bias,
self.stride,
self.padding,
output_padding,
self.groups,
self.dilation,
).to(input.dtype).to(input.dtype)

View File

@@ -0,0 +1,106 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
# UNSLOTH_STUDIO_ENABLED is True only when the `unsloth_studio` package can be
# found and UNSLOTH_STUDIO_DISABLED is unset or "0".
if importlib.util.find_spec("unsloth_studio") is None:
    UNSLOTH_STUDIO_ENABLED = False
else:
    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
# Boolean switches read once from the environment at import time.
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
    logger_compiler.setLevel(logging.DEBUG)
# Module-level counter. (A `global` statement is a no-op at module scope, so
# a plain assignment is all that is needed.)
INFERENCE_RUNS = 0
# Probe for the compiler "stance" API. Both names are set to None together
# when any part of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    # Attribute access only; raises if this torch build has no stance support.
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit
    # raised during import are not swallowed.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (nn)
def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor:
"""
Performs the forward pass.
Attributes:
input (Tensor): The input tensor.
output_size (list[int], optional): A list of integers representing
the size of the output tensor. Default is None.
"""
if self.padding_mode != "zeros":
raise ValueError(
"Only `zeros` padding mode is supported for ConvTranspose2d"
)
assert isinstance(self.padding, tuple)
# One cannot replace List by Tuple or Sequence in "_output_padding" because
# TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
num_spatial_dims = 2
output_padding = self._output_padding(
input,
output_size,
self.stride, # type: ignore[arg-type]
self.padding, # type: ignore[arg-type]
self.kernel_size, # type: ignore[arg-type]
num_spatial_dims,
self.dilation, # type: ignore[arg-type]
)
return F.conv_transpose2d(
input,
self.weight,
self.bias,
self.stride,
self.padding,
output_padding,
self.groups,
self.dilation,
).to(input.dtype).to(input.dtype)

View File

@@ -0,0 +1,98 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
# UNSLOTH_STUDIO_ENABLED is True only when the `unsloth_studio` package can be
# found and UNSLOTH_STUDIO_DISABLED is unset or "0".
if importlib.util.find_spec("unsloth_studio") is None:
    UNSLOTH_STUDIO_ENABLED = False
else:
    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
# Boolean switches read once from the environment at import time.
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
    logger_compiler.setLevel(logging.DEBUG)
# Module-level counter. (A `global` statement is a no-op at module scope, so
# a plain assignment is all that is needed.)
INFERENCE_RUNS = 0
# Probe for the compiler "stance" API. Both names are set to None together
# when any part of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    # Attribute access only; raises if this torch build has no stance support.
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit
    # raised during import are not swallowed.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (nn)
def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor:
if self.padding_mode != "zeros":
raise ValueError(
"Only `zeros` padding mode is supported for ConvTranspose3d"
)
assert isinstance(self.padding, tuple)
# One cannot replace List by Tuple or Sequence in "_output_padding" because
# TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
num_spatial_dims = 3
output_padding = self._output_padding(
input,
output_size,
self.stride, # type: ignore[arg-type]
self.padding, # type: ignore[arg-type]
self.kernel_size, # type: ignore[arg-type]
num_spatial_dims,
self.dilation, # type: ignore[arg-type]
)
return F.conv_transpose3d(
input,
self.weight,
self.bias,
self.stride,
self.padding,
output_padding,
self.groups,
self.dilation,
).to(input.dtype).to(input.dtype)

View File

@@ -0,0 +1,96 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.gptq import (torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add the scaled LoRA update to ``result`` and return the sum.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` with a fused
    ``addmm``; when ``lora_B`` has a bias, ``scaling * bias`` is added too.
    Every operand is cast to ``result.dtype`` first, because ``x`` may have
    been cast to float32 by ``_cast_input_dtype`` when autocast is disabled.
    The return value has the shape of ``result``.
    """
    out_dtype = result.dtype
    out_shape = result.shape
    down_proj = dropout(x).to(out_dtype) @ lora_A.weight.to(out_dtype).t()
    up_weight_t = lora_B.weight.to(out_dtype).t()
    # Flatten the leading dimensions so addmm sees 2D operands, then restore.
    fused = torch_addmm(
        result.view(-1, out_shape[-1]),
        down_proj.view(-1, down_proj.shape[-1]),
        up_weight_t,
        alpha = scaling,
        beta = 1,
    )
    output = fused.view(out_shape)
    lora_bias = lora_B.bias
    if lora_bias is None:
        return output
    return torch_add(output, lora_bias.to(out_dtype), alpha = scaling)
def unsloth_forward(self, x: torch.Tensor):
    """Run the quantized linear module and apply the active LoRA adapter.

    Returns the output of ``self.quant_linear_module`` unchanged when
    adapters are disabled or no active adapter has LoRA weights. Otherwise
    the output goes through ``lora_forward`` (vanilla LoRA) or through the
    adapter's registered variant.
    """
    # note: logic differs from default Linear because merging is not supported
    out = self.quant_linear_module(x)
    if self.disable_adapters:
        return out
    known_adapters = self.lora_A.keys()
    for name in self.active_adapters:
        if name not in known_adapters:
            continue
        dtype_before = out.dtype
        down = self.lora_A[name]
        up = self.lora_B[name]
        drop = self.lora_dropout[name]
        scale = self.scaling[name]
        if not torch.is_autocast_enabled():
            # Without autocast, move both operands to the LoRA weight dtype.
            lora_dtype = down.weight.dtype
            out, x = out.to(lora_dtype), x.to(lora_dtype)
        if name not in self.lora_variant:  # vanilla LoRA
            # NOTE(review): returns on the first vanilla adapter, so any later
            # active adapters are not visited and the cast back to
            # `dtype_before` is skipped on this path.
            return lora_forward(out, down, up, drop, x, scale)
        out = self.lora_variant[name].forward(
            self,
            active_adapter=name,
            x=x,
            result=out,
        )
        out = out.to(dtype_before)
    return out

View File

@@ -0,0 +1,70 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
# UNSLOTH_STUDIO_ENABLED is True only when the `unsloth_studio` package can be
# found and UNSLOTH_STUDIO_DISABLED is unset or "0".
if importlib.util.find_spec("unsloth_studio") is None:
    UNSLOTH_STUDIO_ENABLED = False
else:
    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
# Boolean switches read once from the environment at import time.
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
    logger_compiler.setLevel(logging.DEBUG)
# Module-level counter. (A `global` statement is a no-op at module scope, so
# a plain assignment is all that is needed.)
INFERENCE_RUNS = 0
# Probe for the compiler "stance" API. Both names are set to None together
# when any part of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    # Attribute access only; raises if this torch build has no stance support.
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit
    # raised during import are not swallowed.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
def forward(self, input: Tensor) -> Tensor:
    """Apply group normalization and return the result in the input's dtype.

    Uses ``self.num_groups``, ``self.weight``, ``self.bias`` and ``self.eps``.
    """
    normed = F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps)
    # One cast is sufficient; the generated code repeated the identical
    # `.to(input.dtype)` call, and the second one was a no-op.
    return normed.to(input.dtype)

View File

@@ -0,0 +1,72 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
# UNSLOTH_STUDIO_ENABLED is True only when the `unsloth_studio` package can be
# found and UNSLOTH_STUDIO_DISABLED is unset or "0".
if importlib.util.find_spec("unsloth_studio") is None:
    UNSLOTH_STUDIO_ENABLED = False
else:
    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
# Boolean switches read once from the environment at import time.
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
    logger_compiler.setLevel(logging.DEBUG)
# Module-level counter. (A `global` statement is a no-op at module scope, so
# a plain assignment is all that is needed.)
INFERENCE_RUNS = 0
# Probe for the compiler "stance" API. Both names are set to None together
# when any part of the probe fails.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    # Attribute access only; raises if this torch build has no stance support.
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit
    # raised during import are not swallowed.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
def forward(self, input: Tensor) -> Tensor:
    """Apply layer normalization and return the result in the input's dtype.

    Uses ``self.normalized_shape``, ``self.weight``, ``self.bias`` and
    ``self.eps``.
    """
    normed = F.layer_norm(
        input, self.normalized_shape, self.weight, self.bias, self.eps
    )
    # One cast is sufficient; the generated code repeated the identical
    # `.to(input.dtype)` call, and the second one was a no-op.
    return normed.to(input.dtype)

View File

@@ -0,0 +1,126 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
try:
from peft.tuners.lora.layer import VARIANT_KWARG_KEYS
except ImportError:
VARIANT_KWARG_KEYS = ['alora_offsets']
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.bnb import (VARIANT_KWARG_KEYS, torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add the scaled LoRA update to ``result`` and return the sum.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` with a fused
    ``addmm``; when ``lora_B`` has a bias, ``scaling * bias`` is added too.
    Every operand is cast to ``result.dtype`` first, because ``x`` may have
    been cast to float32 by ``_cast_input_dtype`` when autocast is disabled.
    The return value has the shape of ``result``.
    """
    out_dtype = result.dtype
    out_shape = result.shape
    down_proj = dropout(x).to(out_dtype) @ lora_A.weight.to(out_dtype).t()
    up_weight_t = lora_B.weight.to(out_dtype).t()
    # Flatten the leading dimensions so addmm sees 2D operands, then restore.
    fused = torch_addmm(
        result.view(-1, out_shape[-1]),
        down_proj.view(-1, down_proj.shape[-1]),
        up_weight_t,
        alpha = scaling,
        beta = 1,
    )
    output = fused.view(out_shape)
    lora_bias = lora_B.bias
    if lora_bias is None:
        return output
    return torch_add(output, lora_bias.to(out_dtype), alpha = scaling)
def unsloth_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Run the base layer and add the contribution of every active adapter.

    ``adapter_names`` and the keys in ``VARIANT_KWARG_KEYS`` are popped from
    ``kwargs`` so they never reach the base layer.

    Paths:
        * adapters disabled: unmerge if merged, return the base output;
        * ``adapter_names`` given: delegate to ``self._mixed_batch_forward``;
        * merged: return the base output;
        * otherwise: apply each active adapter in turn, via ``lora_forward``
          for vanilla LoRA or via the adapter's registered variant.

    Unlike the previous generated code, a vanilla adapter no longer returns
    from inside the loop, so later active adapters are also applied. With a
    single active adapter the result is unchanged.
    """
    adapter_names = kwargs.pop("adapter_names", None)
    variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS}  # don't pass these to base_layer
    adapters_disabled = self.disable_adapters
    if adapters_disabled:
        if self.merged:
            self.unmerge()
    elif adapter_names is not None:
        return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)

    # Every remaining path starts with the same base-layer call. Without
    # autocast, the input is first cast to the dtype of a non-quantized base
    # weight (weights carrying `quant_state` are left alone).
    if not torch.is_autocast_enabled():
        base_weight = getattr(self.base_layer, 'weight', None)
        if base_weight is not None and not hasattr(base_weight, 'quant_state') and x.dtype != base_weight.dtype:
            x = x.to(base_weight.dtype)
    result = self.base_layer(x, *args, **kwargs)
    if adapters_disabled or self.merged:
        return result

    # NOTE(review): the original comment here said that, per Tim Dettmers, a
    # defensive clone of `result` is needed for 4bit so that backprop does not
    # fail on a manipulated view. No clone is performed in this block; that is
    # kept as-is.
    for active_adapter in self.active_adapters:
        if active_adapter not in self.lora_A.keys():
            continue
        lora_A = self.lora_A[active_adapter]
        lora_B = self.lora_B[active_adapter]
        dropout = self.lora_dropout[active_adapter]
        scaling = self.scaling[active_adapter]
        requires_conversion = not torch.is_autocast_enabled()
        if requires_conversion:
            expected_dtype = result.dtype
            x = self._cast_input_dtype(x, lora_A.weight.dtype)
        if active_adapter not in self.lora_variant:  # vanilla LoRA
            # lora_forward computes in result.dtype and returns
            # result + update, so no cast back to expected_dtype is needed.
            result = lora_forward(result, lora_A, lora_B, dropout, x, scaling)
        else:
            result = self.lora_variant[active_adapter].forward(
                self,
                active_adapter=active_adapter,
                x=x,
                result=result,
                **variant_kwargs,
                **kwargs,
            )
            if requires_conversion:
                result = result.to(expected_dtype)
    return result

View File

@@ -0,0 +1,118 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
try:
from peft.tuners.lora.layer import VARIANT_KWARG_KEYS
except ImportError:
VARIANT_KWARG_KEYS = ['alora_offsets']
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
import torch._dynamo
@torch._dynamo.disable
def _call_8bit_base_layer(base_layer, x, *args, **kwargs):
return base_layer(x, *args, **kwargs)
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.bnb import (VARIANT_KWARG_KEYS, torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add the scaled LoRA update to ``result`` and return the sum.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` with a fused
    ``addmm``; when ``lora_B`` has a bias, ``scaling * bias`` is added too.
    Every operand is cast to ``result.dtype`` first, because ``x`` may have
    been cast to float32 by ``_cast_input_dtype`` when autocast is disabled.
    The return value has the shape of ``result``.
    """
    out_dtype = result.dtype
    out_shape = result.shape
    down_proj = dropout(x).to(out_dtype) @ lora_A.weight.to(out_dtype).t()
    up_weight_t = lora_B.weight.to(out_dtype).t()
    # Flatten the leading dimensions so addmm sees 2D operands, then restore.
    fused = torch_addmm(
        result.view(-1, out_shape[-1]),
        down_proj.view(-1, down_proj.shape[-1]),
        up_weight_t,
        alpha = scaling,
        beta = 1,
    )
    output = fused.view(out_shape)
    lora_bias = lora_B.bias
    if lora_bias is None:
        return output
    return torch_add(output, lora_bias.to(out_dtype), alpha = scaling)
def unsloth_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Run the 8-bit base layer and add the contribution of every active adapter.

    ``adapter_names`` and the keys in ``VARIANT_KWARG_KEYS`` are popped from
    ``kwargs`` so they never reach the base layer. The base layer is always
    invoked through ``_call_8bit_base_layer``.

    Paths:
        * adapters disabled: unmerge if merged, return the base output;
        * ``adapter_names`` given: delegate to ``self._mixed_batch_forward``;
        * merged: return the base output;
        * otherwise: apply each active adapter in turn, via ``lora_forward``
          for vanilla LoRA or via the adapter's registered variant.

    Unlike the previous generated code, a vanilla adapter no longer returns
    from inside the loop, so later active adapters are also applied. With a
    single active adapter the result is unchanged.
    """
    adapter_names = kwargs.pop("adapter_names", None)
    variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS}  # don't pass these to base_layer
    adapters_disabled = self.disable_adapters
    if adapters_disabled:
        if self.merged:
            self.unmerge()
    elif adapter_names is not None:
        return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)

    # Every remaining path starts with the same base-layer call.
    result = _call_8bit_base_layer(self.base_layer, x, *args, **kwargs)
    if adapters_disabled or self.merged:
        return result

    for active_adapter in self.active_adapters:
        if active_adapter not in self.lora_A.keys():
            continue
        lora_A = self.lora_A[active_adapter]
        lora_B = self.lora_B[active_adapter]
        dropout = self.lora_dropout[active_adapter]
        scaling = self.scaling[active_adapter]
        requires_conversion = not torch.is_autocast_enabled()
        if requires_conversion:
            expected_dtype = result.dtype
            x = self._cast_input_dtype(x, lora_A.weight.dtype)
        if active_adapter not in self.lora_variant:  # vanilla LoRA
            # lora_forward computes in result.dtype and returns
            # result + update, so no cast back to expected_dtype is needed.
            result = lora_forward(result, lora_A, lora_B, dropout, x, scaling)
        else:
            result = self.lora_variant[active_adapter].forward(
                self,
                active_adapter=active_adapter,
                x=x,
                result=result,
                **variant_kwargs,
                **kwargs,
            )
            if requires_conversion:
                result = result.to(expected_dtype)
    return result

View File

@@ -0,0 +1,115 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
try:
from peft.tuners.lora.layer import VARIANT_KWARG_KEYS
except ImportError:
VARIANT_KWARG_KEYS = ['alora_offsets']
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.variants import (Any, torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add the scaled LoRA update to ``result`` and return the sum.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` with a fused
    ``addmm``; when ``lora_B`` has a bias, ``scaling * bias`` is added too.
    Every operand is cast to ``result.dtype`` first, because ``x`` may have
    been cast to float32 by ``_cast_input_dtype`` when autocast is disabled.
    The return value has the shape of ``result``.
    """
    out_dtype = result.dtype
    out_shape = result.shape
    down_proj = dropout(x).to(out_dtype) @ lora_A.weight.to(out_dtype).t()
    up_weight_t = lora_B.weight.to(out_dtype).t()
    # Flatten the leading dimensions so addmm sees 2D operands, then restore.
    fused = torch_addmm(
        result.view(-1, out_shape[-1]),
        down_proj.view(-1, down_proj.shape[-1]),
        up_weight_t,
        alpha = scaling,
        beta = 1,
    )
    output = fused.view(out_shape)
    lora_bias = lora_B.bias
    if lora_bias is None:
        return output
    return torch_add(output, lora_bias.to(out_dtype), alpha = scaling)
def unsloth_forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
    """Run the base layer and apply the active LoRA adapter.

    ``adapter_names`` and the keys in ``VARIANT_KWARG_KEYS`` are popped from
    ``kwargs`` so they never reach the base layer.

    Paths:
        * adapters disabled: unmerge if merged, return the base output;
        * ``adapter_names`` given: delegate to ``self._mixed_batch_forward``;
        * merged: return the base output;
        * otherwise: go through ``lora_forward`` (vanilla LoRA) or through
          the adapter's registered variant.
    """
    adapter_names = kwargs.pop("adapter_names", None)
    variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS}  # don't pass these to base_layer
    adapters_off = self.disable_adapters
    if adapters_off:
        if self.merged:
            self.unmerge()
    elif adapter_names is not None:
        return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)

    # Every remaining path starts with the same base-layer call. Without
    # autocast, the input is first cast to the dtype of a non-quantized base
    # weight (weights carrying `quant_state` are left alone).
    if not torch.is_autocast_enabled():
        base_weight = getattr(self.base_layer, 'weight', None)
        if base_weight is not None and not hasattr(base_weight, 'quant_state') and x.dtype != base_weight.dtype:
            x = x.to(base_weight.dtype)
    out = self.base_layer(x, *args, **kwargs)
    if adapters_off or self.merged:
        return out

    base_dtype = out.dtype
    known_adapters = self.lora_A.keys()
    for name in self.active_adapters:
        if name not in known_adapters:
            continue
        down = self.lora_A[name]
        up = self.lora_B[name]
        drop = self.lora_dropout[name]
        scale = self.scaling[name]
        if not torch.is_autocast_enabled():
            # Without autocast, move both operands to the LoRA weight dtype.
            lora_dtype = down.weight.dtype
            out, x = out.to(lora_dtype), x.to(lora_dtype)
        if name not in self.lora_variant:  # vanilla LoRA
            # NOTE(review): returns on the first vanilla adapter, so any later
            # active adapters are not visited and the cast back to
            # `base_dtype` below is skipped on this path.
            return lora_forward(out, down, up, drop, x, scale)
        out = self.lora_variant[name].forward(
            self,
            active_adapter=name,
            x=x,
            result=out,
            **variant_kwargs,
            **kwargs,
        )
    out = out.to(base_dtype)
    return out

View File

@@ -0,0 +1,92 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from peft.tuners.lora.tp_layer import (Any, __name__, torch)
torch_addmm = torch.addmm
torch_add = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    """Add one LoRA adapter's low-rank update to a base-layer output.

    Computes ``result + scaling * (dropout(x) @ A.T @ B.T)`` and, when
    ``lora_B`` has a bias, additionally adds ``scaling * bias``.

    Args:
        result: Output of the base layer; its dtype is used for all math.
        lora_A: Module exposing ``.weight`` (the down-projection).
        lora_B: Module exposing ``.weight`` and ``.bias`` (the up-projection).
        dropout: Callable applied to ``x`` before the down-projection.
        x: Input that was fed to the base layer.
        scaling: Scalar multiplier for the adapter contribution.

    Returns:
        Tensor with the same shape and dtype as ``result``.
    """
    # Everything is computed in the dtype of `result`, because `x` may arrive
    # in a different dtype than the base-layer output.
    compute_dtype = result.dtype
    low_rank = dropout(x).to(compute_dtype) @ lora_A.weight.to(compute_dtype).t()

    out_shape = result.shape
    flat_result = result.view(-1, out_shape[-1])
    flat_low_rank = low_rank.view(-1, low_rank.shape[-1])
    up_weight_t = lora_B.weight.to(compute_dtype).t()

    # Fused form of: 1 * flat_result + scaling * (flat_low_rank @ up_weight_t)
    output = torch_addmm(
        flat_result,
        flat_low_rank,
        up_weight_t,
        alpha = scaling,
        beta = 1,
    ).view(out_shape)

    lora_bias = lora_B.bias
    if lora_bias is None:
        return output
    # The adapter bias is scaled by the same factor as the low-rank update.
    return torch_add(output, lora_bias.to(compute_dtype), alpha = scaling)
def unsloth_forward(self, x: torch.Tensor, *args: Any, **kwargs: Any):
    """Forward pass of a LoRA-wrapped parallel linear layer.

    The base layer is always invoked through its own ``forward`` (rather than
    multiplying by its weight here) so that the aggregation step of the
    original parallel_linear layer is not lost. The base layer returns a
    ``(result, bias)`` pair, and this function returns the same pair on every
    path.

    Args:
        x: Input tensor.
        *args: Extra positional arguments forwarded to the base layer.
        **kwargs: Extra keyword arguments forwarded to the base layer.
            ``adapter_names`` is popped and not forwarded.

    Returns:
        ``(result, bias)`` where ``result`` is the base output plus the
        contribution of every active adapter present in ``self.lora_A``,
        cast back to the base output's dtype.

    Raises:
        ValueError: If ``adapter_names`` is given while adapters are enabled
            (mixed-batch forward is not supported here).
    """
    adapter_names = kwargs.pop("adapter_names", None)
    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        result, bias = self.base_layer(x, *args, **kwargs)
    elif adapter_names is not None:
        raise ValueError(f"{self.__class__.__name__} does not support mixed_batch_forward yet.")
    elif self.merged:
        result, bias = self.base_layer(x, *args, **kwargs)
    else:
        result, bias = self.base_layer(x, *args, **kwargs)
        torch_result_dtype = result.dtype
        lora_A_keys = self.lora_A.keys()
        for active_adapter in self.active_adapters:
            if active_adapter not in lora_A_keys:
                continue
            lora_A = self.lora_A[active_adapter]
            lora_B = self.lora_B[active_adapter]
            dropout = self.lora_dropout[active_adapter]
            scaling = self.scaling[active_adapter]
            if not torch.is_autocast_enabled():
                result, x = result.to(lora_A.weight.dtype), x.to(lora_A.weight.dtype)
            # Accumulate instead of returning: an early return here would hand
            # callers a bare tensor (dropping `bias`), skip the cast back to
            # the base dtype, and ignore any remaining active adapters.
            result = lora_forward(result, lora_A, lora_B, dropout, x, scaling)
        result = result.to(torch_result_dtype)
    return result, bias

View File

@@ -0,0 +1,73 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
try:
import torch._dynamo.eval_frame as torch_dynamo_eval_frame
torch_dynamo_eval_frame._stance.stance
torch_compiler_set_stance = torch.compiler.set_stance
except:
torch_dynamo_eval_frame = None
torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (torch)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Runs the forward pass.
"""
return F.rms_norm(x, self.normalized_shape, self.weight, self.eps).to(input.dtype).to(input.dtype)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,984 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
UNSLOTH_STUDIO_ENABLED = False
else:
UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.setLevel(logging.DEBUG)
global INFERENCE_RUNS
INFERENCE_RUNS = 0
try:
import torch._dynamo.eval_frame as torch_dynamo_eval_frame
torch_dynamo_eval_frame._stance.stance
torch_compiler_set_stance = torch.compiler.set_stance
except:
torch_dynamo_eval_frame = None
torch_compiler_set_stance = None
pass
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
from unsloth_zoo.loss_utils import (
fused_linear_cross_entropy,
unsloth_fused_ce_loss,
)
if UNSLOTH_STUDIO_ENABLED:
from unsloth_zoo.loss_utils import fast_linear_cross_entropy
scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
@torch.compiler.disable(recursive = False)
def disable_compile_scaled_dot_product_attention(*args, **kwargs):
return scaled_dot_product_attention(*args, **kwargs)
pass
from transformers.modeling_flash_attention_utils import is_flash_attn_available
if is_flash_attn_available():
try:
from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask
except:
flash_attn_supports_top_left_mask = None
try:
from transformers.modeling_flash_attention_utils import _flash_attention_forward
except:
_flash_attention_forward = None
try:
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
except:
FlashAttentionKwargs = None
try:
from transformers.modeling_flash_attention_utils import flash_attn_varlen_func
except:
flash_attn_varlen_func = None
else:
flash_attn_supports_top_left_mask = None
_flash_attention_forward = None
FlashAttentionKwargs = None
flash_attn_varlen_func = None
pass
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch.nn import CrossEntropyLoss
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def normal_cross_entropy_loss(self, hidden_states, labels):
    """Project hidden states to logits and compute the shifted next-token loss.

    Returns:
        ``(loss, logits)`` where ``logits`` is the full float32 output of
        ``self.lm_head`` (not shifted) and ``loss`` is the default
        ``CrossEntropyLoss`` over the shifted, flattened logits and labels.
    """
    logits = self.lm_head(hidden_states).float()
    vocab_size = self.config.vocab_size

    # Position t predicts token t + 1: drop the last logit and the first label,
    # then flatten both so each row is one prediction.
    flat_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
    flat_labels = labels[..., 1:].contiguous().view(-1)

    # Labels may live on another device when the model is split across devices.
    flat_labels = flat_labels.to(flat_logits.device)

    criterion = CrossEntropyLoss()
    return criterion(flat_logits, flat_labels), logits
# We need an empty logits flag to warn people logits will not be returned anymore unless asked ie
# os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
# The backtick after `"1"` closes the inline-code span so the message renders
# with balanced markup.
LOGITS_ERROR_STRING = (
    "Unsloth: Logits are empty from 2024.11 onwards. To get raw logits again, please "
    'set the environment variable `UNSLOTH_RETURN_LOGITS` to `"1"` BEFORE starting to train ie before `trainer.train()`. For example:\n'
    "```\nimport os\n"
    "os.environ['UNSLOTH_RETURN_LOGITS'] = '1'\n"
    "trainer.train()\n```\n"
    "No need to restart your console - just add `os.environ['UNSLOTH_RETURN_LOGITS'] = '1'` before trainer.train() and re-run the cell!"
)


def raise_logits_error(*args, **kwargs):
    """Raise ``NotImplementedError`` carrying ``LOGITS_ERROR_STRING``; all arguments are ignored."""
    raise NotImplementedError(LOGITS_ERROR_STRING)


def return_none(*args, **kwargs):
    """Return ``None`` regardless of the arguments passed."""
    return None
class EmptyLogits:
    """Placeholder object used where logits were not computed.

    Indexing an instance raises ``NotImplementedError`` with
    ``LOGITS_ERROR_STRING``. Looking up an attribute that is not otherwise
    defined returns a callable: ``return_none`` for ``"to"`` and
    ``raise_logits_error`` for any other name. ``repr`` and ``str`` both
    return ``LOGITS_ERROR_STRING``.
    """

    def __init__(self):
        return

    def raise_getattr_error(self, attr):
        """Pick the callable handed back for a missing attribute ``attr``."""
        if attr == "to":
            return return_none
        return raise_logits_error

    # Indexing goes straight to the error helper; missing-attribute lookups are
    # routed through raise_getattr_error.
    __getitem__ = raise_logits_error
    __getattr__ = raise_getattr_error

    def __repr__(self):
        return LOGITS_ERROR_STRING

    def __str__(self):
        return LOGITS_ERROR_STRING


EMPTY_LOGITS = EmptyLogits()
# Attach a stub to EMPTY_LOGITS for every double-underscore attribute name that
# torch.Tensor exposes.
functions = dir(torch.Tensor)
for j, function in enumerate(functions):
# Only names that both start and end with "__" are considered.
if function.startswith("__") and function.endswith("__"):
# Define a function named raise_<j> via exec; when called it prints the dunder's name.
# NOTE(review): despite the "raise_" prefix the stub prints rather than raises - confirm intent.
exec(f"def raise_{j}(*args, **kwargs): print('{function}')", globals(), locals())
# Assign the stub as an attribute on the EMPTY_LOGITS instance; any assignment
# that raises is skipped and the loop moves on to the next name.
# NOTE(review): the bare `except` also swallows non-assignment errors (e.g. a missing raise_<j>) - confirm that is acceptable.
try: exec(f"EMPTY_LOGITS.{function} = raise_{j}", globals(), locals())
except: continue
pass
def mask_attention_mask_out(labels = None, attention_mask = None):
    """Set labels to -100 wherever the attention mask is 0.

    ``labels`` is modified in place and returned. When either argument is
    ``None`` nothing is changed and ``labels`` is returned as given.
    """
    if labels is None or attention_mask is None:
        return labels
    # Bring the mask onto the labels' device before using it as an index.
    mask_on_device = attention_mask.to(device = labels.device)
    masked_positions = mask_on_device == 0
    labels[masked_positions] = -100
    return labels
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.gemma3.modeling_gemma3 import (Callable, Optional, Union, torch, nn, ACT2FN, Cache, PretrainedConfig, GenerationMixin, BaseModelOutputWithPast, ModelOutput, CausalLMOutputWithPast, ROPE_INIT_FUNCTIONS, dynamic_rope_update, PreTrainedModel, can_return_tuple, Gemma3Config, Gemma3TextConfig, logger, __name__, Gemma3Model, Gemma3CausalLMOutputWithPast, Gemma3PreTrainedModel, Gemma3TextModel, Gemma3ForCausalLM, Gemma3ForConditionalGeneration, create_masks_for_generate)
@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def Gemma3MLP_forward(self, x):
    """Gated MLP: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``."""
    gate = self.act_fn(self.gate_proj(x))
    up = self.up_proj(x)
    return self.down_proj(gate * up)
class Gemma3MLP(nn.Module):
    """Gated feed-forward block with bias-free gate, up and down projections.

    Sizes come from ``config.hidden_size`` and ``config.intermediate_size``;
    the activation is looked up in ``ACT2FN`` by ``config.hidden_activation``.
    """

    def __init__(self, config: Gemma3TextConfig):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        self.config = config
        self.hidden_size = hidden
        self.intermediate_size = inner
        # Projections are created in gate / up / down order.
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        """Delegate to the module-level ``Gemma3MLP_forward``."""
        return Gemma3MLP_forward(self, x)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def Gemma3RMSNorm_forward(self, x):
    """RMS-normalize ``x`` over its last dimension in float32 and scale by ``1 + weight``.

    The result is cast back to the dtype ``x`` had on entry.
    """
    input_dtype = x.dtype
    upcast = x.to(torch.float32)
    mean_square = upcast.pow(2).mean(-1, keepdim=True)
    inv_rms = torch.rsqrt(mean_square + self.eps)
    gain = 1.0 + self.weight.to(torch.float32)
    return (upcast * inv_rms * gain).to(input_dtype)
class Gemma3RMSNorm(nn.Module):
    """RMS normalization over the last dimension with a learned ``1 + weight`` gain.

    ``weight`` is initialised to zeros, so the initial gain is 1.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        """Divide ``x`` by the root of its mean square (plus ``eps``) along the last dim."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        normed = self._norm(x.float())
        # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        scaled = normed * (1.0 + self.weight.float())
        return scaled.type_as(x)

    def extra_repr(self):
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.eps}"
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
@torch.no_grad()
@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
def Gemma3RotaryEmbedding_forward(self, x, position_ids):
    """Build the rotary ``(cos, sin)`` tables for ``position_ids``.

    ``x`` only supplies the target device and output dtype. Both outputs are
    multiplied by ``self.attention_scaling`` and cast to ``x.dtype``.
    """
    batch = position_ids.shape[0]
    inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
    position_ids_expanded = position_ids[:, None, :].float()

    # autocast is given "cpu" whenever the device type is "mps" or not a string.
    raw_device_type = x.device.type
    if isinstance(raw_device_type, str) and raw_device_type != "mps":
        device_type = raw_device_type
    else:
        device_type = "cpu"

    with torch.autocast(device_type=device_type, enabled=False):  # Force float32
        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos() * self.attention_scaling
        sin = emb.sin() * self.attention_scaling

    return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
class Gemma3RotaryEmbedding(nn.Module):
    """Holds the rotary inverse frequencies and produces ``(cos, sin)`` tables.

    The RoPE variant is read from ``config.rope_scaling`` when that attribute
    is a dict (key ``"rope_type"``, falling back to ``"type"``); otherwise
    ``"default"`` is used. The matching initialiser from
    ``ROPE_INIT_FUNCTIONS`` supplies ``inv_freq`` and ``attention_scaling``.
    """

    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: Gemma3TextConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        rope_scaling = getattr(config, "rope_scaling", None)
        if isinstance(rope_scaling, dict):
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
        else:
            self.rope_type = "default"

        max_positions = config.max_position_embeddings
        self.max_seq_len_cached = max_positions
        self.original_max_seq_len = max_positions

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent buffer: registered on the module but excluded from state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def forward(self, x, position_ids):
        """Delegate to the module-level ``Gemma3RotaryEmbedding_forward``."""
        return Gemma3RotaryEmbedding_forward(self, x, position_ids)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    # The negated second half goes first, followed by the first half.
    return torch.cat((-back, front), dim=-1)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so they
            broadcast against `q` and `k`. For example, with `cos`/`sin` of
            shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when
            `q` and `k` are [batch_size, heads, seq_len, head_dim], and
            `unsqueeze_dim=2` when they are
            [batch_size, seq_len, heads, head_dim].

    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    rotated_q = (q * cos_b) + (rotate_half(q) * sin_b)
    rotated_k = (k * cos_b) + (rotate_half(k) * sin_b)
    return rotated_q, rotated_k
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat each key/value head `n_rep` times, equivalent to
    torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from
    (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_attention_heads, seqlen, head_dim).
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
    expanded = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Attention computed with explicit matmul, softmax and dropout.

    Args:
        module: Provides ``head_dim``, ``num_key_value_groups`` and ``training``.
        query, key, value: Attention inputs; key/value heads are repeated
            ``module.num_key_value_groups`` times before use.
        attention_mask: Optional additive mask; its last dimension is sliced
            to the key length before being added to the scores.
        dropout: Dropout probability applied to the attention weights.
        scaling: Score multiplier; defaults to ``module.head_dim ** -0.5``.
        softcap: When given, scores become ``tanh(scores / softcap) * softcap``.
        **kwargs: Accepted and ignored.

    Returns:
        ``(attn_output, attn_weights)``; ``attn_output`` has dims 1 and 2
        transposed relative to the weighted values and is made contiguous.
    """
    if scaling is None:
        scaling = module.head_dim**-0.5

    groups = module.num_key_value_groups
    key_states = repeat_kv(key, groups)
    value_states = repeat_kv(value, groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap

    if attention_mask is not None:  # no matter the length, we just slice it
        key_length = key_states.shape[-2]
        attn_weights = attn_weights + attention_mask[:, :, :, :key_length]

    # upcast attention to fp32 for the softmax, then cast back down
    score_dtype = attn_weights.dtype
    probs_fp32 = nn.functional.softmax(attn_weights, dim=-1, dtype = torch.float32)
    attn_weights = probs_fp32.to(score_dtype).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
    return attn_output, attn_weights
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def Gemma3MultiModalProjector_forward(self, vision_outputs: torch.Tensor):
    """Pool, normalize and project vision features.

    The input is transposed, reshaped to a
    ``patches_per_image x patches_per_image`` grid, average-pooled, flattened
    back to a sequence, normalized with ``mm_soft_emb_norm`` and multiplied by
    ``mm_input_projection_weight``. The result is cast to the input's type.
    """
    batch_size, _, seq_length = vision_outputs.shape
    side = self.patches_per_image

    # Transpose dims 1 and 2, then lay the last dimension out as a (side, side) grid.
    grid = vision_outputs.transpose(1, 2).reshape(batch_size, seq_length, side, side)
    grid = grid.contiguous()

    # Pool on the grid, then flatten back to one sequence axis and transpose again.
    pooled = self.avg_pool(grid).flatten(2).transpose(1, 2)

    normed = self.mm_soft_emb_norm(pooled)
    projected = torch.matmul(normed, self.mm_input_projection_weight)
    return projected.type_as(vision_outputs)
class Gemma3MultiModalProjector(nn.Module):
    """Projects vision-tower outputs into the text model's hidden size.

    Holds a zero-initialised projection matrix of shape
    ``(vision hidden_size, text hidden_size)``, an RMS norm over the vision
    hidden size, and an average pool whose kernel and stride are
    ``patches_per_image // tokens_per_side``.
    """

    def __init__(self, config: Gemma3Config):
        super().__init__()
        vision_cfg = config.vision_config

        self.mm_input_projection_weight = nn.Parameter(
            torch.zeros(vision_cfg.hidden_size, config.text_config.hidden_size)
        )
        self.mm_soft_emb_norm = Gemma3RMSNorm(
            vision_cfg.hidden_size, eps=vision_cfg.layer_norm_eps
        )

        self.patches_per_image = int(vision_cfg.image_size // vision_cfg.patch_size)
        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
        self.kernel_size = self.patches_per_image // self.tokens_per_side
        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)

    def forward(self, vision_outputs: torch.Tensor):
        """Delegate to the module-level ``Gemma3MultiModalProjector_forward``."""
        return Gemma3MultiModalProjector_forward(self, vision_outputs)
def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]:
"""
Enables a bidirectional mask within the sliding window.
"""
def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
"""A token can attend to any other token if their absolute distance is within
the (exclusive) sliding window size (distance < sliding_window)."""
return abs(q_idx - kv_idx) < sliding_window
return inner_mask
# Forward pass used by Gemma3ForCausalLM: runs self.model, then produces logits and/or a loss.
# Which branch runs depends on `labels`, on the UNSLOTH_RETURN_LOGITS and
# UNSLOTH_RETURN_HIDDEN_STATES environment variables (read on every call), and on
# whether lm_head's weight requires grad or is float32.
@torch.compiler.disable(recursive = False)
@can_return_tuple
def Gemma3ForCausalLM_forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
) -> CausalLMOutputWithPast:
r"""
Example:
```python
>>> from transformers import AutoTokenizer, Gemma3ForCausalLM
>>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```"""
# Explicit arguments win; otherwise fall back to the values stored on self.config.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs: BaseModelOutputWithPast = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
cache_position=cache_position,
**kwargs,
)
hidden_states = outputs.last_hidden_state
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
# An int logits_to_keep selects the trailing positions (0 gives slice(0, None), i.e. all of them);
# anything else is used directly as the index.
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
# Logits are only computed up front when UNSLOTH_RETURN_LOGITS == '1'; otherwise the
# EMPTY_LOGITS placeholder is used until a branch below replaces it.
logits = self.lm_head(hidden_states[:, slice_indices, :]) if os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '1' else EMPTY_LOGITS
loss = None
NOT_RETURN_LOGITS = os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '0'
RETURN_HIDDEN_STATES = os.environ.get("UNSLOTH_RETURN_HIDDEN_STATES", "0") == "1"
# Look up the item count passed to the loss helpers: "num_items_in_batch" first, then "n_items".
n_items = None
# `kwargs` is the **kwargs dict, so both conditions on the next line hold as written.
if (kwargs) != () and type(kwargs) is dict:
n_items = (kwargs).get("num_items_in_batch", None)
if n_items is None: n_items = (kwargs).get("n_items", None)
# Fallback: search this frame's local variables for a dict carrying the count.
if n_items is None:
all_locals = locals()
# NOTE(review): no local named `loss_kwargs` is defined in this function as shown,
# so this branch looks unreachable here - confirm against the code generator.
if 'loss_kwargs' in all_locals:
__kwargs = all_locals['loss_kwargs']
if type(__kwargs) is dict:
n_items = __kwargs.get("num_items_in_batch", None)
if n_items is None: n_items = __kwargs.get("n_items", None)
if n_items is None and 'kwargs' in all_locals:
__kwargs = all_locals['kwargs']
if type(__kwargs) is dict:
n_items = __kwargs.get("num_items_in_batch", None)
if n_items is None: n_items = __kwargs.get("n_items", None)
# Last resort: take the first local value that is a plain dict and stop there.
if n_items is None:
all_locals = all_locals.values()
for __kwargs in all_locals:
if type(__kwargs) is dict:
n_items = __kwargs.get("num_items_in_batch", None)
if n_items is None: n_items = __kwargs.get("n_items", None)
break
pass
# True when lm_head's weight requires grad or is float32; this rules out the
# fused_linear_cross_entropy branch below.
requires_grad_ = self.lm_head.weight.requires_grad
requires_grad_ = requires_grad_ or self.lm_head.weight.dtype == torch.float32
if RETURN_HIDDEN_STATES:
# The sliced hidden states are returned in the `logits` field; lm_head is not applied here.
logits = hidden_states[:, slice_indices, :]
elif labels is None:
# Set compiler stance to fail on recompiles for inference
global INFERENCE_RUNS
# old_stance stays None when the dynamo stance API could not be imported at module load.
if torch_dynamo_eval_frame is not None:
old_stance = torch_dynamo_eval_frame._stance.stance
else:
old_stance = None
if old_stance is not None and INFERENCE_RUNS == 1:
# Skip guards and return to eager -> we still need guards!
torch_compiler_set_stance(stance = "eager_on_recompile", skip_guard_eval_unsafe = False)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.info(
f"Unsloth: Removing compiler guards after 1 inference run. " \
f"DYNAMO_STANCE.stance = {torch_dynamo_eval_frame._stance.stance} " \
f"DYNAMO_STANCE.skip_guard_eval_unsafe = {torch_dynamo_eval_frame._stance.skip_guard_eval_unsafe}"
)
elif old_stance == "eager_on_recompile":
pass
elif old_stance == "default" and INFERENCE_RUNS > 1:
# Reset compiler stance
torch_compiler_set_stance(stance = "default", skip_guard_eval_unsafe = False)
if UNSLOTH_ENABLE_LOGGING:
logger_compiler.info(
f"Unsloth: Reseting guards. " \
f"DYNAMO_STANCE.stance = {torch_dynamo_eval_frame._stance.stance} " \
f"DYNAMO_STANCE.skip_guard_eval_unsafe = {torch_dynamo_eval_frame._stance.skip_guard_eval_unsafe}"
)
INFERENCE_RUNS = 0
INFERENCE_RUNS += 1
logits = self.lm_head(hidden_states[:, slice_indices, :])
# `(() == () and () == ())` is always True as written.
# NOTE(review): it looks like an unfilled code-generation placeholder - confirm.
elif (() == () and () == ()) and (UNSLOTH_ENABLE_CCE) and NOT_RETURN_LOGITS and self.loss_function.__name__.endswith("ForCausalLMLoss") and labels is not None and not requires_grad_:
# Loss is computed from hidden states and the lm_head weight; `logits` is left as assigned above.
loss = fused_linear_cross_entropy(
hidden_states = hidden_states[:, slice_indices, :],
lm_weight = self.lm_head.weight,
labels = labels.to(self.lm_head.weight.device),
num_items_in_batch = n_items,
logit_softcapping = None if (self.config.final_logit_softcapping) == () else (self.config.final_logit_softcapping),
)
elif self.loss_function.__name__.endswith("ForCausalLMLoss") and labels is not None:
lm_head_weight = self.lm_head.weight
lm_head_bias = getattr(self.lm_head, "bias", None)
# ========= NEW fused =========
_hidden_states = hidden_states[:, slice_indices, :]
# Dimension 1 of both tensors is marked dynamic for torch._dynamo.
torch._dynamo.mark_dynamic(_hidden_states, 1)
torch._dynamo.mark_dynamic(labels, 1)
loss = unsloth_fused_ce_loss(
trainer = None,
hidden_states = _hidden_states,
lm_head_weight = lm_head_weight,
lm_head_bias = lm_head_bias,
labels = labels,
mask = None,
n_items = n_items,
scaling = getattr(self, "accelerator_scaler", None),
target_gb = None,
torch_compile = not UNSLOTH_COMPILE_DISABLE,
# `() if () != () else 0` evaluates to 0 as written.
logit_scale_multiply = () if () != () else 0,
logit_scale_divide = () if () != () else 0,
logit_softcapping = (self.config.final_logit_softcapping) if (self.config.final_logit_softcapping) != () else 0,
)
else:
# Remaining case: compute real logits and call self.loss_function on them.
logits = self.lm_head(hidden_states[:, slice_indices, :])
# `() != ()` is always False, so the two scaling steps below never run as written.
if () != ():
logits = logits * ()
if () != ():
logits = logits / ()
# Soft-capping: logits = tanh(logits / cap) * cap, skipped when the cap is None or ().
if (self.config.final_logit_softcapping) not in (None, (),):
logits = logits / (self.config.final_logit_softcapping)
logits = torch.tanh(logits)
logits = logits * (self.config.final_logit_softcapping)
loss = self.loss_function(logits, labels.to(self.lm_head.weight.device), vocab_size=self.vocab_size, **kwargs)
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
    """Gemma3 text model with a bias-free linear language-modeling head.

    ``forward`` passes every argument through to the module-level
    ``Gemma3ForCausalLM_forward``.
    """

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    config: Gemma3TextConfig
    base_model_prefix = "language_model"

    def __init__(self, config: Gemma3TextConfig):
        super().__init__(config)
        self.model = Gemma3TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Forward all arguments, in declaration order, to ``Gemma3ForCausalLM_forward``."""
        return Gemma3ForCausalLM_forward(
            self,
            input_ids,
            attention_mask,
            position_ids,
            past_key_values,
            inputs_embeds,
            labels,
            use_cache,
            output_attentions,
            output_hidden_states,
            cache_position,
            logits_to_keep,
            **kwargs,
        )
def token_type_ids_mask_function(
    token_type_ids: Optional[torch.Tensor],
    image_group_ids: Optional[torch.Tensor],
    tokens_per_image: int,
) -> Optional[Callable]:
    """
    Build a mask function that allows attention between tokens of the same image block.

    Returns `None` when `token_type_ids` is `None`. Otherwise the returned
    callable takes `(batch_idx, head_idx, q_idx, kv_idx)` and is true when both
    positions have token type 1 and carry the same image group id. Key/value
    indices beyond the length of the id tensors are treated as token type 0 and
    group -1. `tokens_per_image` is accepted but not used.
    """
    # Do not return an additional mask in this case
    if token_type_ids is None:
        return None

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        # NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
        # Since vmap doesn't support `if statement` we workaround it with `torch.where`
        kv_within_types = kv_idx < token_type_ids.shape[1]
        kv_within_groups = kv_idx < image_group_ids.shape[1]

        # Clamp out-of-range key positions to index 0 so the lookups stay valid;
        # the looked-up values are then overridden for those positions.
        safe_idx = torch.where(kv_within_types, kv_idx, 0)
        kv_token_type = torch.where(kv_within_types, token_type_ids[batch_idx, safe_idx], 0)
        kv_image_group = torch.where(kv_within_groups, image_group_ids[batch_idx, safe_idx], -1)

        # If it's 1 for both query and key/value, we are in an image block
        both_image_tokens = (token_type_ids[batch_idx, q_idx] == 1) & (kv_token_type == 1)
        same_image = image_group_ids[batch_idx, q_idx] == kv_image_group

        # This is bidirectional attention whenever we are dealing with image tokens
        return both_image_tokens & same_image

    return inner_mask
@torch.compiler.disable(recursive = False)
def Gemma3ForConditionalGeneration_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **lm_kwargs,
) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
    r"""
    Forward pass behind `Gemma3ForConditionalGeneration.forward`.

    Runs `self.model` and then takes one of three paths:

    - `UNSLOTH_RETURN_HIDDEN_STATES == "1"`: the sliced hidden states are returned in the
      `logits` field and no loss is computed.
    - `labels is None`: logits are computed with `self.lm_head` (after the compiler-stance
      bookkeeping below).
    - otherwise: the loss comes from `unsloth_fused_ce_loss`; `logits` is the `EMPTY_LOGITS`
      placeholder unless `UNSLOTH_RETURN_LOGITS == "1"`.

    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
        config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
    Example:
    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
    >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
    >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
    >>> messages = [
    ...     {
    ...         "role": "system",
    ...         "content": [
    ...             {"type": "text", "text": "You are a helpful assistant."}
    ...         ]
    ...     },
    ...     {
    ...         "role": "user", "content": [
    ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
    ...             {"type": "text", "text": "Where is the cat standing?"},
    ...         ]
    ...     },
    ... ]
    >>> inputs = processor.apply_chat_template(
    ...     messages,
    ...     tokenize=True,
    ...     return_dict=True,
    ...     return_tensors="pt",
    ...     add_generation_prompt=True
    ... )
    >>> # Generate
    >>> generate_ids = model.generate(**inputs)
    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
    ```
    """
    # Explicit arguments win; otherwise fall back to the values stored on the config.
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    outputs = self.model(
        input_ids=input_ids,
        pixel_values=pixel_values,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        # NOTE(review): `mask_attention_mask_out` is defined outside this view; presumably
        # it marks label positions with attention_mask == 0 as ignored (-100) -- confirm.
        labels=mask_attention_mask_out(labels = labels, attention_mask = attention_mask),
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
        **lm_kwargs,
    )
    hidden_states = outputs[0]
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    # An int `logits_to_keep` keeps the last N positions (0 gives slice(0, None), i.e. all of
    # them); any other value is used directly as the index along dim 1.
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    # Logits are only materialised here when UNSLOTH_RETURN_LOGITS == '1'; otherwise the
    # EMPTY_LOGITS placeholder stands in for them.
    logits = self.lm_head(hidden_states[:, slice_indices, :]) if os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '1' else EMPTY_LOGITS
    loss = None
    # NOTE: NOT_RETURN_LOGITS is not read again in this function.
    NOT_RETURN_LOGITS = os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '0'
    RETURN_HIDDEN_STATES = os.environ.get("UNSLOTH_RETURN_HIDDEN_STATES", "0") == "1"
    # Look up the item count (`num_items_in_batch`, falling back to `n_items`) for the loss:
    # first in a local called `loss_kwargs`, then in one called `kwargs`, and finally in the
    # first local whose exact type is `dict`. This function binds neither `loss_kwargs` nor
    # `kwargs`, so only the final scan can match here.
    # NOTE(review): with this signature the first plain dict is expected to be `lm_kwargs`
    # -- confirm that no earlier argument is ever passed as a plain dict.
    all_locals = locals()
    n_items = None
    if 'loss_kwargs' in all_locals:
        __kwargs = all_locals['loss_kwargs']
        if type(__kwargs) is dict:
            n_items = __kwargs.get("num_items_in_batch", None)
            if n_items is None: n_items = __kwargs.get("n_items", None)
    if n_items is None and 'kwargs' in all_locals:
        __kwargs = all_locals['kwargs']
        if type(__kwargs) is dict:
            n_items = __kwargs.get("num_items_in_batch", None)
            if n_items is None: n_items = __kwargs.get("n_items", None)
    if n_items is None:
        all_locals = all_locals.values()
        for __kwargs in all_locals:
            if type(__kwargs) is dict:
                n_items = __kwargs.get("num_items_in_batch", None)
                if n_items is None: n_items = __kwargs.get("n_items", None)
                # Only the first plain dict is inspected.
                break
    pass
    # NOTE: `requires_grad_` is computed but not read again in this function.
    requires_grad_ = self.lm_head.weight.requires_grad
    requires_grad_ = requires_grad_ or self.lm_head.weight.dtype == torch.float32
    if RETURN_HIDDEN_STATES:
        # Hand back the sliced hidden states in place of logits; `loss` stays None.
        logits = hidden_states[:, slice_indices, :]
    elif labels is None:
        # Set compiler stance to fail on recompiles for inference
        global INFERENCE_RUNS
        # `old_stance` is None when the dynamo stance API could not be set up at import time.
        if torch_dynamo_eval_frame is not None:
            old_stance = torch_dynamo_eval_frame._stance.stance
        else:
            old_stance = None
        if old_stance is not None and INFERENCE_RUNS == 1:
            # Skip guards and return to eager -> we still need guards!
            torch_compiler_set_stance(stance = "eager_on_recompile", skip_guard_eval_unsafe = False)
            if UNSLOTH_ENABLE_LOGGING:
                logger_compiler.info(
                    f"Unsloth: Removing compiler guards after 1 inference run. " \
                    f"DYNAMO_STANCE.stance = {torch_dynamo_eval_frame._stance.stance} " \
                    f"DYNAMO_STANCE.skip_guard_eval_unsafe = {torch_dynamo_eval_frame._stance.skip_guard_eval_unsafe}"
                )
        elif old_stance == "eager_on_recompile":
            # Already in the inference stance: nothing to change.
            pass
        elif old_stance == "default" and INFERENCE_RUNS > 1:
            # Reset compiler stance
            torch_compiler_set_stance(stance = "default", skip_guard_eval_unsafe = False)
            if UNSLOTH_ENABLE_LOGGING:
                logger_compiler.info(
                    f"Unsloth: Reseting guards. " \
                    f"DYNAMO_STANCE.stance = {torch_dynamo_eval_frame._stance.stance} " \
                    f"DYNAMO_STANCE.skip_guard_eval_unsafe = {torch_dynamo_eval_frame._stance.skip_guard_eval_unsafe}"
                )
            # Restart the count so that the next label-free call is run number 1 again.
            INFERENCE_RUNS = 0
        INFERENCE_RUNS += 1
        logits = self.lm_head(hidden_states[:, slice_indices, :])
    else:
        # Training path: the loss is computed from hidden states and the lm_head parameters
        # by `unsloth_fused_ce_loss`; `self.lm_head` is not called in this branch.
        lm_head_weight = self.lm_head.weight
        lm_head_bias = getattr(self.lm_head, "bias", None)
        # ========= NEW fused =========
        _hidden_states = hidden_states[:, slice_indices, :]
        # Mark dim 1 as dynamic for torch.compile on every tensor handed to the loss.
        torch._dynamo.mark_dynamic(_hidden_states, 1)
        torch._dynamo.mark_dynamic(labels, 1)
        if attention_mask is not None:
            torch._dynamo.mark_dynamic(attention_mask, 1)
        loss = unsloth_fused_ce_loss(
            trainer = None,
            hidden_states = _hidden_states,
            lm_head_weight = lm_head_weight,
            lm_head_bias = lm_head_bias,
            labels = labels,
            mask = attention_mask,
            n_items = n_items,
            scaling = getattr(self, "accelerator_scaler", None),
            target_gb = None,
            torch_compile = not UNSLOTH_COMPILE_DISABLE,
            # `() != ()` is always False, so each of the next three arguments is 0.
            # NOTE(review): presumably 0 means "feature off" in unsloth_fused_ce_loss -- confirm.
            logit_scale_multiply = () if () != () else 0,
            logit_scale_divide = () if () != () else 0,
            logit_softcapping = () if () != () else 0,
        )
    if not return_dict:
        # Tuple output: (loss?, logits, *remaining model outputs)
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output
    return Gemma3CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        image_hidden_states=outputs.image_hidden_states,
    )
class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
    """Multimodal Gemma3 wrapper: a `Gemma3Model` plus a bias-free linear `lm_head`.

    `forward` delegates to the module-level `Gemma3ForConditionalGeneration_forward`;
    the other methods forward to `self.model` or prepare inputs and masks for generation.
    """
    # Maps old checkpoint key prefixes (keys are `^`-anchored patterns) to the current layout.
    # NOTE(review): presumably applied by the transformers loading code -- confirm.
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]
    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
    # Fix: https://github.com/huggingface/transformers/issues/40564
    accepts_loss_kwargs = False

    def __init__(self, config: Gemma3Config):
        """Build the multimodal model and an LM head sized from `config.text_config`."""
        super().__init__(config)
        self.model = Gemma3Model(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        """Return `self.model.get_input_embeddings()`."""
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Forward `value` to `self.model.set_input_embeddings`."""
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        """Forward `decoder` to `self.model.set_decoder`."""
        self.model.set_decoder(decoder)

    def get_decoder(self):
        """Return `self.model.get_decoder()`."""
        return self.model.get_decoder()

    def get_image_features(self, pixel_values):
        """Return `self.model.get_image_features(pixel_values)`."""
        return self.model.get_image_features(pixel_values)

    # Make modules available through conditional class for BC
    @property
    def language_model(self):
        """Shortcut for `self.model.language_model`."""
        return self.model.language_model

    @property
    def vision_tower(self):
        """Shortcut for `self.model.vision_tower`."""
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        """Shortcut for `self.model.multi_modal_projector`."""
        return self.model.multi_modal_projector

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
        """Delegate to `Gemma3ForConditionalGeneration_forward`.

        The named arguments are passed positionally in signature order and the
        extra keyword arguments are forwarded unchanged.
        """
        return Gemma3ForConditionalGeneration_forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, token_type_ids, cache_position, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, logits_to_keep, **lm_kwargs)

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        pixel_values=None,
        attention_mask=None,
        token_type_ids=None,
        use_cache=True,
        logits_to_keep=None,
        labels=None,
        **kwargs,
    ):
        """Build the model inputs for one generation step.

        Everything except `pixel_values` and `labels` is forwarded to the parent
        implementation; `labels` is accepted but not used. `pixel_values` is added
        to the result only when `cache_position[0] == 0`.
        """
        # Overwritten -- custom `position_ids` and `pixel_values` handling
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cache_position=cache_position,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
        # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
        # NOTE(review): `cache_position` defaults to None but is indexed unconditionally here;
        # callers presumably always provide it -- confirm.
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values
        return model_inputs

    @staticmethod
    def create_masks_for_generate(
        config: PretrainedConfig,
        input_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        cache_position: torch.Tensor,
        past_key_values: Optional[Cache],
        position_ids: Optional[torch.Tensor],
        token_type_ids: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> dict:
        """Assemble the mask arguments and call the module-level `create_masks_for_generate`.

        When `token_type_ids` is given and `input_embeds` has more than one position
        along dim 1, an `or_mask_function` built by `token_type_ids_mask_function` is
        added. Extra `**kwargs` are accepted but not forwarded.
        """
        # Prepare mask arguments
        mask_kwargs = {
            "config": config.get_text_config(),
            "input_embeds": input_embeds,
            "attention_mask": attention_mask,
            "cache_position": cache_position,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
        }
        # Add the token type ids mask for generate as well
        if token_type_ids is not None and input_embeds.shape[1] != 1:
            # We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
            # First find where a new image block starts: 1 if image and previous not image
            # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
            is_image = (token_type_ids == 1).to(cache_position.device)
            # Left-pad by one and drop the last column: each position sees its predecessor's flag.
            new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
            # Running count of image starts, minus one: 0-based index of the current image.
            image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
            # Non-image positions get group id -1.
            # NOTE(review): `is_image` lives on `cache_position.device` while `torch.full_like`
            # follows `token_type_ids`' device -- confirm both are always the same device.
            image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
            mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
                token_type_ids.to(cache_position.device), image_group_ids, config.mm_tokens_per_image
            )
        # Inside a method body this name resolves to the module-level function, not to this
        # static method (class attributes are not in scope here).
        return create_masks_for_generate(**mask_kwargs)
if hasattr(logger, "addFilter"):
    import logging

    class HideLoggingMessage(logging.Filter):
        """Logging filter that drops every record whose message contains `text`."""

        def __init__(self, text):
            # Run the base initialiser so the instance is a fully set-up logging.Filter.
            super().__init__()
            self.text = text

        def filter(self, x):
            """Return False (drop the record) when `self.text` occurs in its message."""
            return self.text not in x.getMessage()

    # Drop the log records that mention `use_cache=True`.
    logger.addFilter(HideLoggingMessage("`use_cache=True`"))

View File

@@ -0,0 +1,535 @@
"""
2026.2.1
2026.2.1
4.57.6
0.24.0
__UNSLOTH_VERSIONING__
"""
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import torch
import importlib.util
import math
# Studio support is on only when the `unsloth_studio` package can be found and the
# UNSLOTH_STUDIO_DISABLED environment variable is unset or "0".
UNSLOTH_STUDIO_ENABLED = (
    importlib.util.find_spec("unsloth_studio") is not None
    and os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
)
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math
# Feature switches, read once from the environment when this module is imported.
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)

import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
    logger_compiler.setLevel(logging.DEBUG)

# Module-level counter, starting at zero. (A `global` statement at module level has no
# effect, so none is used here.)
INFERENCE_RUNS = 0

# Probe the dynamo "stance" API. Both names are set together: either both usable or
# both None, so callers only need to test one of them.
try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except Exception:
    # Missing module or attribute on this torch build: run without stance switching.
    # KeyboardInterrupt / SystemExit are deliberately not swallowed.
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
from unsloth_zoo.loss_utils import (
fused_linear_cross_entropy,
unsloth_fused_ce_loss,
)
if UNSLOTH_STUDIO_ENABLED:
from unsloth_zoo.loss_utils import fast_linear_cross_entropy
# Module-level handle on PyTorch's scaled-dot-product attention function.
scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
@torch.compiler.disable(recursive = False)
def disable_compile_scaled_dot_product_attention(*args, **kwargs):
    """Call `scaled_dot_product_attention` with the given arguments.

    The wrapper itself is decorated with `torch.compiler.disable(recursive = False)`;
    arguments and return value are passed through unchanged.
    """
    return scaled_dot_product_attention(*args, **kwargs)
pass
from transformers.modeling_flash_attention_utils import is_flash_attn_available

# Optional flash-attention helpers. Each name is imported on its own so that one
# missing symbol does not hide the others; a name that cannot be imported is None.
# Only ImportError is treated as "not available" -- any other failure is a real error
# and is allowed to surface.
if is_flash_attn_available():
    try:
        from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask
    except ImportError:
        flash_attn_supports_top_left_mask = None
    try:
        from transformers.modeling_flash_attention_utils import _flash_attention_forward
    except ImportError:
        _flash_attention_forward = None
    try:
        from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
    except ImportError:
        FlashAttentionKwargs = None
    try:
        from transformers.modeling_flash_attention_utils import flash_attn_varlen_func
    except ImportError:
        flash_attn_varlen_func = None
else:
    flash_attn_supports_top_left_mask = None
    _flash_attention_forward = None
    FlashAttentionKwargs = None
    flash_attn_varlen_func = None
# Options dict passed as `options = torch_compile_options` to the `torch.compile`
# decorators in this module.
torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
from torch.nn import CrossEntropyLoss
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def normal_cross_entropy_loss(self, hidden_states, labels):
    """
    Project `hidden_states` through `self.lm_head` and return `(loss, logits)`.

    The logits are cast to float. Position `i` is scored against label `i + 1`
    with `CrossEntropyLoss` using its default arguments.
    """
    logits = self.lm_head(hidden_states).float()
    # Shift so that tokens < n predict n, then flatten to (tokens, vocab)
    predictions = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
    # Flatten the targets and move them next to the predictions (model parallelism)
    targets = labels[..., 1:].contiguous().view(-1).to(predictions.device)
    loss = CrossEntropyLoss()(predictions, targets)
    return loss, logits
# Placeholder machinery for "logits were not computed": unless the user sets
# os.environ['UNSLOTH_RETURN_LOGITS'] = '1', logits are replaced by an object that
# explains how to get them back.
LOGITS_ERROR_STRING = (
    "Unsloth: Logits are empty from 2024.11 onwards. To get raw logits again, please "
    'set the environment variable `UNSLOTH_RETURN_LOGITS` to `"1" BEFORE starting to train ie before `trainer.train()`. For example:\n'
    "```\nimport os\n"
    "os.environ['UNSLOTH_RETURN_LOGITS'] = '1'\n"
    "trainer.train()\n```\n"
    "No need to restart your console - just add `os.environ['UNSLOTH_RETURN_LOGITS'] = '1'` before trainer.train() and re-run the cell!"
)


def raise_logits_error(*args, **kwargs):
    """Raise NotImplementedError carrying `LOGITS_ERROR_STRING`, whatever the arguments."""
    raise NotImplementedError(LOGITS_ERROR_STRING)


def return_none(*args, **kwargs):
    """Accept any arguments and return None."""
    return None


class EmptyLogits:
    """Stand-in for missing logits.

    Indexing raises the explanatory error, `.to(...)` is a no-op returning None,
    any other missing attribute resolves to `raise_logits_error`, and
    `repr` / `str` give the explanation text.
    """

    def __init__(self):
        return

    def raise_getattr_error(self, attr):
        # `.to(...)` must keep working so device moves do not crash
        if attr == "to":
            return return_none
        return raise_logits_error

    __getitem__ = raise_logits_error
    __getattr__ = raise_getattr_error

    def __repr__(self):
        return LOGITS_ERROR_STRING

    def __str__(self):
        return LOGITS_ERROR_STRING
# Shared placeholder instance used wherever logits were not computed.
EMPTY_LOGITS = EmptyLogits()
functions = dir(torch.Tensor)


def _make_dunder_stub(dunder_name):
    """Return a function that accepts any arguments and just prints `dunder_name`."""
    def _stub(*args, **kwargs):
        print(dunder_name)
    return _stub


# Attach a printing stub to the placeholder for every dunder name that torch.Tensor
# exposes. These are plain instance attributes (same as the previous exec-built
# assignments), so they only matter for explicit attribute access.
for function in functions:
    if function.startswith("__") and function.endswith("__"):
        try:
            setattr(EMPTY_LOGITS, function, _make_dunder_stub(function))
        except (AttributeError, TypeError):
            # Some names cannot be rebound on an instance (e.g. `__class__`, `__dict__`,
            # `__weakref__`); skipping them is the intended best-effort behaviour.
            continue
def mask_attention_mask_out(labels = None, attention_mask = None):
    """
    Set `labels` to -100 wherever `attention_mask` is 0 and return `labels`.

    The write happens in place on the tensor that was passed in. When either
    argument is None, `labels` is returned untouched.
    """
    if labels is None or attention_mask is None:
        return labels
    padding_positions = attention_mask.to(device = labels.device) == 0
    labels[padding_positions] = -100
    return labels
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
from unsloth_zoo.temporary_patches.common import torch_compile
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
from transformers.models.siglip.modeling_siglip import (math, warnings, Callable, Optional, np, torch, nn, _calculate_fan_in_and_fan_out, ACT2FN, ALL_ATTENTION_FUNCTIONS, torch_int, SiglipTextConfig, SiglipVisionConfig)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def _trunc_normal_(tensor, mean, std, a, b):
    """
    Fill `tensor` in place with draws from a normal(mean, std) restricted to [a, b].

    Works by sampling uniformly between the CDF values of the two bounds and
    pushing the samples through the inverse normal CDF. Returns None.
    """
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    two_std = 2 * std
    if mean < a - two_std or mean > b + two_std:
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # CDF values of the lower and upper cut-offs
    lower_cdf = norm_cdf((a - mean) / std)
    upper_cdf = norm_cdf((b - mean) / std)

    # Uniform samples in [2l-1, 2u-1], then the inverse-CDF transform gives a
    # truncated standard normal
    tensor.uniform_(2 * lower_cdf - 1, 2 * upper_cdf - 1).erfinv_()
    # Rescale and shift to the requested mean / std
    tensor.mul_(std * math.sqrt(2.0)).add_(mean)
    # Clamp to make sure the values stay inside [a, b]
    tensor.clamp_(min=a, max=b)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Returns:
        The same `tensor` object, modified in place.
    """
    with torch.no_grad():
        # Sample a standard normal truncated to [a, b] first ...
        _trunc_normal_(tensor, 0, 1.0, a, b)
        # ... then scale and shift, so the bounds apply before mean/std do.
        tensor.mul_(std).add_(mean)
    # Return the tensor so the function matches its annotation.
    return tensor
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    """Initialise `tensor` in place with variance `scale / denom`.

    Args:
        tensor: tensor to fill; its fan-in / fan-out come from
            `_calculate_fan_in_and_fan_out`.
        scale: numerator of the target variance.
        mode: which fan value is the denominator: "fan_in", "fan_out" or
            "fan_avg" (the mean of both).
        distribution: "truncated_normal", "normal" or "uniform".

    Raises:
        ValueError: if `mode` or `distribution` is not one of the values above.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2
    else:
        # Fail with a clear message instead of an UnboundLocalError on `denom` below.
        raise ValueError(f"invalid mode {mode}")
    variance = scale / denom
    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        # A uniform distribution on [-bound, bound] has variance bound**2 / 3
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def lecun_normal_(tensor):
    """Initialise `tensor` via `variance_scaling_` with fan-in mode and a truncated normal."""
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def default_flax_embed_init(tensor):
    """Initialise `tensor` via `variance_scaling_` with fan-in mode and a plain normal."""
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def SiglipVisionEmbeddings_forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
    """
    Embed image patches and add position information.

    `pixel_values` must be 4-dimensional; its last two dimensions are taken as
    height and width. The input is cast to the patch convolution's weight dtype.
    With `interpolate_pos_encoding` the position term comes from
    `self.interpolate_pos_encoding`, otherwise from the stored position table.
    """
    _batch, _channels, height, width = pixel_values.shape
    weight_dtype = self.patch_embedding.weight.dtype
    patches = self.patch_embedding(pixel_values.to(dtype=weight_dtype))  # shape = [*, width, grid, grid]
    # Flatten the spatial grid into a sequence: (batch, patches, channels)
    tokens = patches.flatten(2).transpose(1, 2)
    if interpolate_pos_encoding:
        positions = self.interpolate_pos_encoding(tokens, height, width)
    else:
        positions = self.position_embedding(self.position_ids)
    return tokens + positions
class SiglipVisionEmbeddings(nn.Module):
    """Patch embedding (strided `nn.Conv2d`) plus a learned position-embedding table.

    `forward` delegates to the module-level `SiglipVisionEmbeddings_forward`.
    """

    def __init__(self, config: SiglipVisionConfig):
        """Create the patch convolution, the position table and the `position_ids` buffer."""
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        # kernel_size == stride == patch_size with no padding: one output position per patch
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        # Square grid of patches; the table has one row per patch
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        # Indices 0..num_positions-1 with shape (1, num_positions), registered as a
        # non-persistent buffer
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        Interpolate the pre-trained position encodings so that the model can be used on
        higher-resolution images. The method is also adapted to support torch.jit tracing
        and has no class embedding.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211

        Args:
            embeddings: patch embeddings; dim 1 is read as the patch count and the last
                dim as the embedding size.
            height: input image height in pixels.
            width: input image width in pixels.

        Returns:
            The stored table looked up unchanged when no interpolation is needed,
            otherwise a `(1, new_height * new_width, dim)` tensor resampled from it.
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embedding.weight.shape[0]
        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)
        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
        dim = embeddings.shape[-1]
        # Target grid size in patches
        new_height = height // self.patch_size
        new_width = width // self.patch_size
        # The stored table is treated as a square grid
        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        # Channels-first layout for `interpolate`
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        # Back to channels-last and flatten the grid into a sequence
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        """Delegate to `SiglipVisionEmbeddings_forward`."""
        return SiglipVisionEmbeddings_forward(self, pixel_values, interpolate_pos_encoding)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def SiglipTextEmbeddings_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
    """
    Return token embeddings plus position embeddings.

    The sequence length is read from `input_ids` when given, otherwise from
    `inputs_embeds`. Missing `position_ids` default to the first `seq_length`
    stored positions; missing `inputs_embeds` are looked up from `input_ids`.

    Raises:
        ValueError: if the sequence is longer than the position-embedding table.
    """
    if input_ids is not None:
        seq_length = input_ids.shape[-1]
    else:
        seq_length = inputs_embeds.shape[-2]
    max_position_embedding = self.position_embedding.weight.shape[0]

    if seq_length > max_position_embedding:
        raise ValueError(
            f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
            f"{seq_length} and max_position_embeddings: {max_position_embedding}"
        )

    if position_ids is None:
        position_ids = self.position_ids[:, :seq_length]
    if inputs_embeds is None:
        inputs_embeds = self.token_embedding(input_ids)

    return inputs_embeds + self.position_embedding(position_ids)
class SiglipTextEmbeddings(nn.Module):
    """Token-embedding and position-embedding tables for the text side.

    `forward` delegates to the module-level `SiglipTextEmbeddings_forward`.
    """

    def __init__(self, config: SiglipTextConfig):
        """Create both embedding tables and the `position_ids` buffer."""
        super().__init__()
        embed_dim = config.hidden_size
        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        # NOTE(review): the buffer is registered with persistent=False, which does not match
        # the "exported when serialized" wording above -- confirm which one is intended.
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """Delegate to `SiglipTextEmbeddings_forward`."""
        return SiglipTextEmbeddings_forward(self, input_ids, position_ids, inputs_embeds)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Plain matmul / softmax attention.

    Scores are `query @ key^T * scaling`, plus `attention_mask` when given. The
    softmax runs in float32 and the result is cast back (first to the score dtype,
    then to the query dtype). Dropout follows `module.training`. Returns
    `(attn_output, attn_weights)` with dims 1 and 2 of the output swapped and made
    contiguous. Extra keyword arguments are ignored.
    """
    scores = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask

    probs = nn.functional.softmax(scores, dim=-1, dtype = torch.float32)
    probs = probs.to(scores.dtype).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
    return context, probs
@torch.compiler.disable(recursive = False)
def SiglipAttention_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Input shape: Batch x Time x Channel

    Projects `hidden_states` to per-head queries, keys and values, runs the
    attention implementation selected by `self.config._attn_implementation`
    and applies the output projection. Returns `(attn_output, attn_weights)`.
    Extra keyword arguments are accepted but not passed on.
    """
    batch_size, seq_length, embed_dim = hidden_states.shape
    per_head_shape = (batch_size, seq_length, self.num_heads, self.head_dim)

    # (batch, seq, embed) -> (batch, heads, seq, head_dim)
    queries = self.q_proj(hidden_states).view(per_head_shape).transpose(1, 2)
    keys = self.k_proj(hidden_states).view(per_head_shape).transpose(1, 2)
    values = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)

    implementation = self.config._attn_implementation
    attention_interface: Callable = (
        eager_attention_forward if implementation == "eager" else ALL_ATTENTION_FUNCTIONS[implementation]
    )

    # Dropout is only active while training
    dropout_p = self.dropout if self.training else 0.0
    attn_output, attn_weights = attention_interface(
        self,
        queries,
        keys,
        values,
        attention_mask,
        is_causal=self.is_causal,
        scaling=self.scale,
        dropout=dropout_p,
    )

    # Merge the heads back into one embedding dimension before the output projection
    merged = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
    return self.out_proj(merged), attn_weights
class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Holds the four square linear projections (q, k, v, out); `forward` delegates to
    the module-level `SiglipAttention_forward`.
    """

    def __init__(self, config):
        """Derive the head layout from `config` and create the projections.

        Raises:
            ValueError: if `config.hidden_size` is not divisible by
                `config.num_attention_heads`.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # 1 / sqrt(head_dim) scaling applied to the attention scores
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Delegate to `SiglipAttention_forward`, forwarding `**kwargs` unchanged."""
        return SiglipAttention_forward(self, hidden_states, attention_mask, **kwargs)
@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def SiglipMLP_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    """Apply `fc1`, the activation function and `fc2`, in that order."""
    expanded = self.fc1(hidden_states)
    activated = self.activation_fn(expanded)
    return self.fc2(activated)
class SiglipMLP(nn.Module):
    """Two-layer feed-forward block: `hidden_size -> intermediate_size -> hidden_size`.

    `forward` delegates to the module-level `SiglipMLP_forward`.
    """

    def __init__(self, config):
        """Look up the activation named by `config.hidden_act` and create both linear layers."""
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Delegate to `SiglipMLP_forward`."""
        return SiglipMLP_forward(self, hidden_states)
@torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def SiglipMultiheadAttentionPoolingHead_forward(self, hidden_state):
    """
    Pool a sequence into one vector per sample.

    The probe parameter is repeated along dim 0 once per sample and used as the
    attention query over `hidden_state`; the attended result then goes through a
    layer-norm + MLP branch with a residual connection. Index 0 along dim 1 of
    the result is returned.
    """
    n_samples = hidden_state.shape[0]
    queries = self.probe.repeat(n_samples, 1, 1)
    # `self.attention` returns a tuple; only its first element is used
    attended = self.attention(queries, hidden_state, hidden_state)[0]
    pooled = attended + self.mlp(self.layernorm(attended))
    return pooled[:, 0]
class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling.

    Owns a `(1, 1, hidden_size)` probe parameter, a batch-first
    `torch.nn.MultiheadAttention`, a layer norm and a `SiglipMLP`; `forward`
    delegates to the module-level `SiglipMultiheadAttentionPoolingHead_forward`.
    """

    def __init__(self, config: SiglipVisionConfig):
        """Create the probe, attention, layer norm and MLP from `config`."""
        super().__init__()
        # Randomly initialised probe parameter
        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state):
        """Delegate to `SiglipMultiheadAttentionPoolingHead_forward`."""
        return SiglipMultiheadAttentionPoolingHead_forward(self, hidden_state)