diff --git a/commands/package.go b/commands/package.go index 3e3cec02..d9a41c6b 100644 --- a/commands/package.go +++ b/commands/package.go @@ -74,6 +74,7 @@ func newPackagedCmd() *cobra.Command { } c.Flags().StringVar(&opts.ggufPath, "gguf", "", "absolute path to gguf file (required)") + c.Flags().StringVar(&opts.chatTemplatePath, "chat-template", "", "absolute path to chat template file (must be Jinja format)") c.Flags().StringArrayVarP(&opts.licensePaths, "license", "l", nil, "absolute path to a license file") c.Flags().BoolVar(&opts.push, "push", false, "push to registry (if not set, the model is loaded into the Model Runner content store)") c.Flags().Uint64Var(&opts.contextSize, "context-size", 0, "context size in tokens") @@ -81,11 +82,12 @@ func newPackagedCmd() *cobra.Command { } type packageOptions struct { - ggufPath string - licensePaths []string - push bool - contextSize uint64 - tag string + chatTemplatePath string + contextSize uint64 + ggufPath string + licensePaths []string + push bool + tag string } func packageModel(cmd *cobra.Command, opts packageOptions) error { @@ -126,6 +128,13 @@ func packageModel(cmd *cobra.Command, opts packageOptions) error { } } + if opts.chatTemplatePath != "" { + cmd.PrintErrf("Adding chat template file from %q\n", opts.chatTemplatePath) + if pkg, err = pkg.WithChatTemplateFile(opts.chatTemplatePath); err != nil { + return fmt.Errorf("add chat template file from path %q: %w", opts.chatTemplatePath, err) + } + } + if opts.push { cmd.PrintErrln("Pushing model to registry...") } else { diff --git a/docs/reference/docker_model_package.yaml b/docs/reference/docker_model_package.yaml index 36fbc388..712a9480 100644 --- a/docs/reference/docker_model_package.yaml +++ b/docs/reference/docker_model_package.yaml @@ -8,6 +8,15 @@ usage: docker model package --gguf [--license ...] [--context-size pname: docker model plink: docker_model.yaml options: + - option: chat-template + value_type: string + description: absolute path to chat template file (must be Jinja format) + deprecated: false + hidden: false + experimental: false + experimentalcli: false + kubernetes: false + swarm: false - option: context-size value_type: uint64 default_value: "0" diff --git a/docs/reference/model_package.md b/docs/reference/model_package.md index 62dc7d89..a0448f79 100644 --- a/docs/reference/model_package.md +++ b/docs/reference/model_package.md @@ -8,6 +8,7 @@ When packaging a sharded model --gguf should point to the first shard. 
All shard | Name | Type | Default | Description | |:------------------|:--------------|:--------|:---------------------------------------------------------------------------------------| +| `--chat-template` | `string` | | absolute path to chat template file (must be Jinja format) | | `--context-size` | `uint64` | `0` | context size in tokens | | `--gguf` | `string` | | absolute path to gguf file (required) | | `-l`, `--license` | `stringArray` | | absolute path to a license file | diff --git a/go.mod b/go.mod index 6ac2df9c..0719d66e 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/docker/docker v28.2.2+incompatible github.com/docker/go-connections v0.5.0 github.com/docker/go-units v0.5.0 - github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058 + github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57 github.com/docker/model-runner v0.0.0-20250911130340-38bb0171c947 github.com/fatih/color v1.15.0 github.com/google/go-containerregistry v0.20.6 @@ -55,7 +55,7 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/gpustack/gguf-parser-go v0.14.1 // indirect + github.com/gpustack/gguf-parser-go v0.22.1 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1 // indirect github.com/henvic/httpretty v0.1.4 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -84,7 +84,6 @@ require ( github.com/prometheus/common v0.65.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect - github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect diff --git a/go.sum b/go.sum index 98238031..7f8150d5 100644 --- a/go.sum +++ b/go.sum @@ -80,8 +80,8 @@ github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDD github.com/docker/go-winjob v0.0.0-20250829235554-57b487ebcbc5 h1:dxSFEb0EEmvceIawSFNDMrvKakRz2t+2WYpY3dFAT04= github.com/docker/go-winjob v0.0.0-20250829235554-57b487ebcbc5/go.mod h1:ICOGmIXdwhfid7rQP+tLvDJqVg0lHdEk3pI5nsapTtg= github.com/docker/libtrust v0.0.0-20160708172513-aabc10ec26b7/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE= -github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058 h1:whffgQ1pmiMFVrxRhJKA9yyCJXvmVX6iiohU9ezKCx0= -github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c= +github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57 h1:WHiPO9UmO5v97T3ksQUA2SbYVkTdUCSFobznegL97kk= +github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57/go.mod h1:bV1RH2e79nTwOW38GoMU9UO8gpZVLH9+cZeEeR4wSeE= github.com/docker/model-runner v0.0.0-20250911130340-38bb0171c947 h1:6Dz1SFZONEd8tlKetn2Gu6v5HDJI/YtUFwkqHGwrsV0= github.com/docker/model-runner v0.0.0-20250911130340-38bb0171c947/go.mod h1:cl7panafjkSHllYCCGYAzty2aUvbwk55Gi35v06XL80= github.com/dvsekhvalnov/jose2go v0.0.0-20170216131308-f21a8cedbbae/go.mod h1:7BvyPhdbLxMXIYTFPLsyJRFMsKmOZnQmzh6Gb+uquuM= @@ -130,8 +130,8 @@ github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux 
v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/gpustack/gguf-parser-go v0.14.1 h1:tmz2eTnSEFfE52V10FESqo9oAUquZ6JKQFntWC/wrEg= -github.com/gpustack/gguf-parser-go v0.14.1/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo= +github.com/gpustack/gguf-parser-go v0.22.1 h1:FRnEDWqT0Rcplr/R9ctCRSN2+3DhVsf6dnR5/i9JA4E= +github.com/gpustack/gguf-parser-go v0.22.1/go.mod h1:y4TwTtDqFWTK+xvprOjRUh+dowgU2TKCX37vRKvGiZ0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1 h1:e9Rjr40Z98/clHv5Yg79Is0NtosR5LXRvdr7o/6NwbA= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1/go.mod h1:tIxuGz/9mpox++sgp9fJjHO0+q1X9/UOWd798aAm22M= github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= @@ -243,8 +243,6 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8= -github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.0.6/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= diff --git a/vendor/github.com/docker/model-distribution/builder/builder.go b/vendor/github.com/docker/model-distribution/builder/builder.go index 871e462a..659d1e8e 100644 --- a/vendor/github.com/docker/model-distribution/builder/builder.go +++ b/vendor/github.com/docker/model-distribution/builder/builder.go @@ -55,6 +55,17 @@ func (b *Builder) WithMultimodalProjector(path string) (*Builder, error) { }, nil } +// WithChatTemplateFile adds a Jinja chat template file to the artifact which takes precedence over template from GGUF. 
+func (b *Builder) WithChatTemplateFile(path string) (*Builder, error) { + templateLayer, err := partial.NewLayer(path, types.MediaTypeChatTemplate) + if err != nil { + return nil, fmt.Errorf("chat template layer from %q: %w", path, err) + } + return &Builder{ + model: mutate.AppendLayers(b.model, templateLayer), + }, nil +} + // Target represents a build target type Target interface { Write(context.Context, types.ModelArtifact, io.Writer) error diff --git a/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go b/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go index a32b8031..5476fc55 100644 --- a/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go +++ b/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go @@ -8,10 +8,11 @@ import ( // Bundle represents a runtime bundle containing a model and runtime config type Bundle struct { - dir string - mmprojPath string - ggufFile string // path to GGUF file (first shard when model is split among files) - runtimeConfig types.Config + dir string + mmprojPath string + ggufFile string // path to GGUF file (first shard when model is split among files) + runtimeConfig types.Config + chatTemplatePath string } // RootDir return the path to the bundle root directory @@ -36,6 +37,14 @@ func (b *Bundle) MMPROJPath() string { return filepath.Join(b.dir, b.mmprojPath) } +// ChatTemplatePath return the path to a Jinja chat template file or "" if none is present. +func (b *Bundle) ChatTemplatePath() string { + if b.chatTemplatePath == "" { + return "" + } + return filepath.Join(b.dir, b.chatTemplatePath) +} + // RuntimeConfig returns config that should be respected by the backend at runtime. func (b *Bundle) RuntimeConfig() types.Config { return b.runtimeConfig diff --git a/vendor/github.com/docker/model-distribution/internal/bundle/parse.go b/vendor/github.com/docker/model-distribution/internal/bundle/parse.go index 016254cb..93912dac 100644 --- a/vendor/github.com/docker/model-distribution/internal/bundle/parse.go +++ b/vendor/github.com/docker/model-distribution/internal/bundle/parse.go @@ -22,15 +22,20 @@ func Parse(rootDir string) (*Bundle, error) { if err != nil { return nil, err } + templatePath, err := findChatTemplateFile(rootDir) + if err != nil { + return nil, err + } cfg, err := parseRuntimeConfig(rootDir) if err != nil { return nil, err } return &Bundle{ - dir: rootDir, - mmprojPath: mmprojPath, - ggufFile: ggufPath, - runtimeConfig: cfg, + dir: rootDir, + mmprojPath: mmprojPath, + ggufFile: ggufPath, + runtimeConfig: cfg, + chatTemplatePath: templatePath, }, nil } @@ -71,3 +76,17 @@ func findMultiModalProjectorFile(rootDir string) (string, error) { } return filepath.Base(mmprojPaths[0]), nil } + +func findChatTemplateFile(rootDir string) (string, error) { + templatePaths, err := filepath.Glob(filepath.Join(rootDir, "[^.]*.jinja")) + if err != nil { + return "", err + } + if len(templatePaths) == 0 { + return "", nil + } + if len(templatePaths) > 1 { + return "", fmt.Errorf("found multiple template files, but only 1 is supported") + } + return filepath.Base(templatePaths[0]), nil +} diff --git a/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go b/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go index 5fe6a23e..f44069e5 100644 --- a/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go +++ b/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go @@ -20,6 +20,9 @@ func Unpack(dir string, model types.Model) 
(*Bundle, error) { if err := unpackMultiModalProjector(bundle, model); err != nil { return nil, fmt.Errorf("add multi-model projector file to runtime bundle: %w", err) } + if err := unpackTemplate(bundle, model); err != nil { + return nil, fmt.Errorf("add chat template file to runtime bundle: %w", err) + } if err := unpackRuntimeConfig(bundle, model); err != nil { return nil, fmt.Errorf("add config.json to runtime bundle: %w", err) } @@ -80,6 +83,18 @@ func unpackMultiModalProjector(bundle *Bundle, mdl types.Model) error { return nil } +func unpackTemplate(bundle *Bundle, mdl types.Model) error { + path, err := mdl.ChatTemplatePath() + if err != nil { + return nil // no such file + } + if err = unpackFile(filepath.Join(bundle.dir, "template.jinja"), path); err != nil { + return err + } + bundle.chatTemplatePath = "template.jinja" + return nil +} + func unpackFile(bundlePath string, srcPath string) error { return os.Link(srcPath, bundlePath) } diff --git a/vendor/github.com/docker/model-distribution/internal/partial/partial.go b/vendor/github.com/docker/model-distribution/internal/partial/partial.go index 7367556c..8d6c3a27 100644 --- a/vendor/github.com/docker/model-distribution/internal/partial/partial.go +++ b/vendor/github.com/docker/model-distribution/internal/partial/partial.go @@ -84,6 +84,21 @@ func MMPROJPath(i WithLayers) (string, error) { return paths[0], err } +func ChatTemplatePath(i WithLayers) (string, error) { + paths, err := layerPathsByMediaType(i, types.MediaTypeChatTemplate) + if err != nil { + return "", fmt.Errorf("get chat template layer paths: %w", err) + } + if len(paths) == 0 { + return "", fmt.Errorf("model does not contain any layer of type %q", types.MediaTypeChatTemplate) + } + if len(paths) > 1 { + return "", fmt.Errorf("found %d files of type %q, expected exactly 1", + len(paths), types.MediaTypeChatTemplate) + } + return paths[0], err +} + // layerPathsByMediaType is a generic helper function that finds a layer by media type and returns its path func layerPathsByMediaType(i WithLayers, mediaType ggcr.MediaType) ([]string, error) { layers, err := i.Layers() diff --git a/vendor/github.com/docker/model-distribution/internal/store/model.go b/vendor/github.com/docker/model-distribution/internal/store/model.go index b35539a6..bd3a4fa0 100644 --- a/vendor/github.com/docker/model-distribution/internal/store/model.go +++ b/vendor/github.com/docker/model-distribution/internal/store/model.go @@ -118,6 +118,10 @@ func (m *Model) MMPROJPath() (string, error) { return mdpartial.MMPROJPath(m) } +func (m *Model) ChatTemplatePath() (string, error) { + return mdpartial.ChatTemplatePath(m) +} + func (m *Model) Tags() []string { return m.tags } diff --git a/vendor/github.com/docker/model-distribution/types/config.go b/vendor/github.com/docker/model-distribution/types/config.go index 8211dd2a..0261a9f9 100644 --- a/vendor/github.com/docker/model-distribution/types/config.go +++ b/vendor/github.com/docker/model-distribution/types/config.go @@ -23,6 +23,9 @@ const ( // MediaTypeMultimodalProjector indicates a Multimodal projector file MediaTypeMultimodalProjector = types.MediaType("application/vnd.docker.ai.mmproj") + // MediaTypeChatTemplate indicates a Jinja chat template + MediaTypeChatTemplate = types.MediaType("application/vnd.docker.ai.chat.template.jinja") + FormatGGUF = Format("gguf") ) diff --git a/vendor/github.com/docker/model-distribution/types/model.go b/vendor/github.com/docker/model-distribution/types/model.go index 62374c02..7f9ba394 100644 --- 
a/vendor/github.com/docker/model-distribution/types/model.go +++ b/vendor/github.com/docker/model-distribution/types/model.go @@ -11,6 +11,7 @@ type Model interface { Config() (Config, error) Tags() []string Descriptor() (Descriptor, error) + ChatTemplatePath() (string, error) } type ModelArtifact interface { @@ -23,6 +24,7 @@ type ModelArtifact interface { type ModelBundle interface { RootDir() string GGUFPath() string + ChatTemplatePath() string MMPROJPath() string RuntimeConfig() Config } diff --git a/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml b/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml index 480355ee..f514dad7 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml +++ b/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml @@ -1,3 +1,5 @@ +version: "1" + run: timeout: 10m tests: true @@ -8,7 +10,6 @@ run: output: print-issued-lines: true print-linter-name: true - uniq-by-line: true path-prefix: "" sort-results: true @@ -22,7 +23,7 @@ linters: - errcheck - errname - errorlint - - exportloopref + - copyloopvar - godot - goconst - gocritic @@ -83,6 +84,7 @@ linters-settings: - G101 - G107 - G112 + - G115 - G404 gofumpt: extra-rules: true @@ -119,7 +121,6 @@ linters-settings: unused: field-writes-are-uses: true post-statements-are-reads: true - exported-is-used: true exported-fields-are-used: true parameters-are-used: true local-variables-are-used: true @@ -133,6 +134,7 @@ linters-settings: crypto-hash: true issues: + uniq-by-line: true exclude-files: - "doc.go" - "zz_generated.*.go" diff --git a/vendor/github.com/gpustack/gguf-parser-go/Makefile b/vendor/github.com/gpustack/gguf-parser-go/Makefile index 1eea47d6..2834fb6c 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/Makefile +++ b/vendor/github.com/gpustack/gguf-parser-go/Makefile @@ -34,33 +34,33 @@ generate: lint: @echo "+++ $@ +++" - if [[ "$(LINT_DIRTY)" == "true" ]]; then \ - if [[ -n $$(git status --porcelain) ]]; then \ - echo "Code tree is dirty."; \ - git diff --exit-code; \ - fi; \ - fi - [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin" [[ -f "$(SRCDIR)/.sbin/goimports-reviser" ]] || \ - curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.6.5/goimports-reviser_3.6.5_$(GOOS)_$(GOARCH).tar.gz" \ + curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.8.2/goimports-reviser_3.8.2_$(GOOS)_$(GOARCH).tar.gz" \ | tar -zxvf - --directory "$(SRCDIR)/.sbin" --no-same-owner --exclude ./LICENSE --exclude ./README.md && chmod +x "$(SRCDIR)/.sbin/goimports-reviser" cd $(SRCDIR) && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \ - | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} + | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 cd $(SRCDIR)/cmd/gguf-parser && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! 
-name 'zz_generated.*' \ - | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} + | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 [[ -f "$(SRCDIR)/.sbin/golangci-lint" ]] || \ curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \ - | sh -s -- -b "$(SRCDIR)/.sbin" "v1.59.0" + | sh -s -- -b "$(SRCDIR)/.sbin" "v1.63.4" cd $(SRCDIR) && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... cd $(SRCDIR)/cmd/gguf-parser && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... + if [[ "$(LINT_DIRTY)" == "true" ]]; then \ + if [[ -n $$(git status --porcelain) ]]; then \ + echo "Code tree is dirty."; \ + git diff --exit-code; \ + fi; \ + fi + @echo "--- $@ ---" test: @@ -99,7 +99,7 @@ gguf-parser: if [[ $$os == "darwin" ]]; then \ [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin"; \ [[ -f "$(SRCDIR)/.sbin/lipo" ]] || \ - GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.1; \ + GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.2; \ "$(SRCDIR)/.sbin/lipo" -create -output $(SRCDIR)/.dist/gguf-parser-darwin-universal $(SRCDIR)/.dist/gguf-parser-darwin-amd64 $(SRCDIR)/.dist/gguf-parser-darwin-arm64; \ fi;\ if [[ $$os == "$(GOOS)" ]] && [[ $$arch == "$(GOARCH)" ]]; then \ @@ -126,7 +126,7 @@ package: build fi; \ if [[ "$(PACKAGE_PUBLISH)" == "true" ]]; then \ if [[ -z $$(docker buildx inspect --builder "gguf-parser") ]]; then \ - docker run --rm --privileged tonistiigi/binfmt:qemu-v7.0.0 --install $$platform; \ + docker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2 --install $$platform; \ docker buildx create --name "gguf-parser" --driver "docker-container" --buildkitd-flags "--allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host" --bootstrap; \ fi; \ docker buildx build --progress=plain --platform=$$platform --builder="gguf-parser" --output="type=image,name=$$image,push=true" "$(SRCDIR)"; \ @@ -137,4 +137,4 @@ package: build @echo "--- $@ ---" -ci: deps generate test lint build +ci: deps generate lint test build diff --git a/vendor/github.com/gpustack/gguf-parser-go/README.md b/vendor/github.com/gpustack/gguf-parser-go/README.md index 96c6804c..52d6f5f2 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/README.md +++ b/vendor/github.com/gpustack/gguf-parser-go/README.md @@ -56,6 +56,16 @@ download it. ## Notes +- **Since v0.20.0**, GGUF Parser supports leveraging `--override-tensor` to indicate how to place the model tensors. +- **Since v0.19.0**, GGUF Parser supports estimating Audio projector model file, like Ultravox series, Qwen2 Audio + series, etc. +- **Since v0.18.0**, GGUF Parser supports estimating SWA-supported(sliding window attention) model file, like LLaMA 4 + series, Gemma2/3 series, etc. +- **Since v0.17.0**, GGUF Parser align the `QUANTIZATION`( + aka. [`general.file_type`](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#general-metadata)) + to [HuggingFace processing](https://github.com/huggingface/huggingface.js/blob/2475d6d316135c0a4fceff6b3fe2aed0dde36ac1/packages/gguf/src/types.ts#L11-L48), + but there are still many model files whose naming does not fully follow `general.file_type`. +- **Since v0.16.0**, GGUF Parser supports estimating MLA-supported model file, like DeepSeek series. 
- **Since v0.14.0 (BREAKING CHANGE)**, GGUF Parser parses `*.feed_forward_length` metadata as `[]uint64`, which means the architecture `feedForwardLength` is a list of integers. - **Since v0.13.0 (BREAKING CHANGE)**, GGUF Parser can parse files @@ -93,21 +103,21 @@ Install from [releases](https://github.com/gpustack/gguf-parser-go/releases). ```shell $ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf -+-------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+-------------------------+-------+----------------+---------------+----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------------------------+-------+----------------+---------------+----------+------------+----------+ -| model | DeepSeek R1 Distill ... | qwen2 | IQ2_XXS/Q4_K_M | true | 4.36 GiB | 7.62 B | 4.91 bpw | -+-------+-------------------------+-------+----------------+---------------+----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 3584 | 7 | true | 28 | 28 | 18944 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| model | DeepSeek R1 Distill ... 
| qwen2 | Q4_K_M | true | 4.36 GiB | 7.62 B | 4.91 bpw | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -117,15 +127,15 @@ $ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen- | gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+------------+------------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ -| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 654.26 MiB | 804.26 MiB | 28 + 1 | 7 GiB | 18.59 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | 
RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ +| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 18.89 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ $ # Retrieve the model's metadata via split file, $ # which needs all split files has been downloaded. @@ -138,13 +148,13 @@ $ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qw | model | qwen2.5-7b-instruct | qwen2 | Q8_0 | true | 7.54 GiB | 7.62 B | 8.50 bpw | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 3584 | 7 | true | 28 | 28 | 18944 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -154,29 +164,36 @@ $ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qw | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ 
-+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ +| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 21.82 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse Remote File ```shell $ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/resolve/main/Qwen2.5-72B-Instruct-Q4_K_M.gguf" -+-----------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+----------------------+-------+----------------+---------------+-----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+----------------------+-------+----------------+---------------+-----------+------------+----------+ -| model | Qwen2.5 72B Instruct | qwen2 | IQ2_XXS/Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw | -+-------+----------------------+-------+----------------+---------------+-----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 32768 | 8192 | 8 | true | 64 | 80 | 29568 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++---------------------------------------------------------------------------------------------------------+ +| METADATA | 
++-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Qwen2.5 72B Instruct | qwen2 | Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw | ++-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 32768 | 8192 | true | 64 | 80 | 29568 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -186,15 +203,15 @@ $ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/ | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+------------+------------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ -| qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 403.39 MiB | 553.39 MiB | 80 + 1 | 10 GiB | 57.87 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | 
++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+----------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+-----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ +| qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 426.57 MiB | 576.57 MiB | 80 + 1 | 10.31 GiB | 58.18 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ $ # Retrieve the model's metadata via split file @@ -204,16 +221,16 @@ $ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/mai +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ -| model | DeepSeek R1 BF16 | deepseek2 | BF16 | true | 130.60 GiB | 671.03 B | 1.67 bpw | +| model | DeepSeek R1 BF16 | deepseek2 | IQ1_S | true | 130.60 GiB | 671.03 B | 1.67 bpw | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 163840 | 7168 | 1 | true | N/A | 61 | 18432 | 256 | 129280 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 163840 | 7168 | true | N/A | 61 | 18432 | 256 | 129280 | 
++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -230,9 +247,8 @@ $ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/mai | | | | | | | | | | +--------------------+-----------+-----------+----------------+------------+--------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ -| deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.01 GiB | 13.16 GiB | 61 + 1 | 762.50 GiB | 1 TB | +| deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.03 GiB | 13.18 GiB | 61 + 1 | 762.76 GiB | 1 TB | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ - ``` #### Parse From HuggingFace @@ -251,13 +267,13 @@ $ gguf-parser --hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-V | model | Qwen2 VL 2B Instruct | qwen2vl | F16 | true | 2.88 GiB | 1.54 B | 16.00 bpw | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 32768 | 1536 | 6 | true | 12 | 28 | 8960 | 0 | 151936 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 32768 | 1536 | true | 12 | 28 | 8960 | 0 | 151936 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -274,7 +290,7 @@ $ gguf-parser 
--hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-V | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ -| qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 213.55 MiB | 363.55 MiB | 28 + 1 | 3.35 GiB | 12.60 GiB | +| qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 236.87 MiB | 386.87 MiB | 28 + 1 | 3.65 GiB | 12.86 GiB | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ $ # Retrieve the model's metadata via split file @@ -288,13 +304,13 @@ $ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf- | model | Openbuddy Llama3.3 7... | llama | Q4_0 | true | 37.35 GiB | 70.55 B | 4.55 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 8192 | 8 | true | 64 | 80 | 28672 | 0 | 128256 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -304,16 +320,15 @@ $ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf- | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128048 | N/A | N/A | N/A | N/A | 128044 | 
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+---------+----------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ -| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.04 GB | 1.11 GiB | 80 + 1 | 40 GiB | 93.36 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ - ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 93.62 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ ``` #### Parse From ModelScope @@ -332,13 +347,13 @@ $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="De | model | DeepSeek R1 Distill ... 
| qwen2 | F16 | true | 14.19 GiB | 7.62 B | 16.00 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 3584 | 7 | true | 28 | 28 | 18944 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -348,16 +363,15 @@ $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="De | gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+------------+------------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ -| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 654.26 MiB | 804.26 MiB | 28 + 1 | 7 GiB | 27.69 GiB | 
-+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ - ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ +| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 27.99 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse From Ollama Library @@ -368,21 +382,21 @@ $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="De ```shell $ gguf-parser --ol-model="llama3.3" -+--------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| model | Llama 3.1 70B Instru... 
| llama | IQ2_XXS/Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 8192 | 8 | true | 64 | 80 | 28672 | 0 | 128256 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -392,34 +406,34 @@ $ gguf-parser --ol-model="llama3.3" | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | 
+--------------------+---------+----------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ -| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.04 GB | 1.11 GiB | 80 + 1 | 40 GiB | 95.60 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 95.86 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ $ # Ollama Model includes the preset params and other artifacts, like multimodal projectors or LoRA adapters, $ # you can get the usage of Ollama running by using `--ol-usage` option. -$ +--------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| model | Llama 3.1 70B Instru... 
| llama | IQ2_XXS/Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 8192 | 8 | true | 64 | 80 | 28672 | 0 | 128256 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -436,9 +450,8 @@ $ +----------------------------------------------------------------------------- | | | | | | | | | | +--------------------+------------+------------+----------------+------------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ -| llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 232.08 MiB | 382.08 MiB | 80 + 1 | 640.52 MiB | 40.23 GiB | +| llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 255.27 MiB | 405.27 MiB | 80 + 1 | 906.50 MiB | 40.49 GiB | 
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ - ``` #### Others @@ -456,13 +469,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -471,7 +484,7 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 333.45 MiB | 483.45 MiB | 31.89 GiB | 41.15 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 41.15 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model without offload Conditioner and Autoencoder @@ -484,13 +497,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION 
ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +-------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -499,7 +512,7 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +-----------+-----------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 16.43 GiB | 16.58 GiB | 22.29 GiB | 25.05 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 16.44 GiB | 16.59 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ $ # Parse FLUX.1-dev Model with Autoencoder tiling @@ -512,13 +525,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -527,7 +540,7 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 333.45 MiB | 483.45 MiB | 31.89 GiB | 36.28 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 36.28 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model with multiple devices offloading @@ -541,13 +554,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" 
--hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -556,31 +569,32 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +------------+------------+----------+----------+------------+--------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 333.45 MiB | 483.45 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ - ``` ##### Parse None Model ```shell $ # Parse Multi-Modal Projector -$ gguf-parser --hf-repo="bartowski/Qwen2-VL-72B-Instruct-GGUF" --hf-file="mmproj-Qwen2-VL-72B-Instruct-f16.gguf" -+---------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-----------+-------------------------+------+--------------+---------------+----------+------------+-----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-----------+-------------------------+------+--------------+---------------+----------+------------+-----------+ -| projector | Qwen2-VL-72B-Instruc... 
| clip | F16 | true | 1.30 GiB | 699.36 M | 16.01 bpw | -+-----------+-------------------------+------+--------------+---------------+----------+------------+-----------+ - -+----------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+---------------+--------+------------------+---------+ -| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER | -+----------------+---------------+--------+------------------+---------+ -| qwen2vl_merger | 1280 | 32 | 0 | Vision | -+----------------+---------------+--------+------------------+---------+ +$ gguf-parser --hf-repo="unsloth/Qwen2.5-Omni-3B-GGUF" --hf-file="mmproj-F32.gguf" ++-------------------------------------------------------------------------------------------------------+ +| METADATA | ++-----------+-----------------+------+--------------+---------------+----------+------------+-----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-----------+-----------------+------+--------------+---------------+----------+------------+-----------+ +| projector | Qwen2.5-Omni-3B | clip | F32 | true | 4.86 GiB | 1.31 B | 31.93 bpw | ++-----------+-----------------+------+--------------+---------------+----------+------------+-----------+ + ++-------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+-------------------------------+-----------------+-------------------------------------+----------------+ +| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER | +| +---------------+---------------+--------+--------+------------------+------------------+ | +| | VISION | AUDIO | VISION | AUDIO | VISION | AUDIO | | ++----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+ +| qwen2.5o | 1280 | 1280 | 32 | 32 | 1280 | 5120 | Vision & Audio | ++----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+ $ # Parse LoRA Adapter $ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" @@ -599,7 +613,6 @@ $ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3- +--------------+------------+ | lora | 32 | +--------------+------------+ - ``` ### Estimate @@ -641,7 +654,7 @@ flowchart TD ``` ```shell -$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="8,10" --in-short +$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+----------------------------------------+ @@ -649,9 +662,8 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | 
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ -| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.01 GiB | 22.57 GiB | +| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.27 GiB | 22.83 GiB | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ - ``` Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following @@ -659,9 +671,9 @@ resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| -| host1 | ENOUGH | 388.08 MiB | | | :thumbsup: | -| host1 (NVIDIA 4080 0) | | | 8 GiB | 17.79 GiB | | -| host1 (NVIDIA 4080 1) | | | 10 GiB | 22.51 GiB | | +| host1 | ENOUGH | 399.27 MiB | | | :thumbsup: | +| host1 (NVIDIA 4080 0) | | | 8 GiB | 17.83 GiB | | +| host1 (NVIDIA 4080 1) | | | 10 GiB | 22.83 GiB | | It appears that running the model on `host1` alone is not feasible. @@ -694,7 +706,7 @@ flowchart TD ``` ```shell -$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --in-short +$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+ @@ -702,9 +714,8 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ -| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 6.87 GiB | 7.38 GiB | +| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 7.13 GiB | 7.64 GiB | 
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ - ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the @@ -712,11 +723,11 @@ following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| -| host4 | 11 GiB | 388.08 MiB | | | :thumbsup: | +| host4 | 11 GiB | 399.27 MiB | | | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 9.28 GiB | | | host1 (NVIDIA 4080 1) | | | 10 GiB | 11.32 GiB | | | host2 (NVIDIA 4090) | | | 12 GiB | 13.19 GiB | | -| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.87 GiB | | +| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 7.13 GiB | | It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`. @@ -746,17 +757,16 @@ flowchart TD ``` ```shell -$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --in-short -+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+---------------------------------------+ -| RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 | -+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+-----------+----------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+-----------+----------+ -| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 36.52 MiB | 5.91 GiB | -+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+-----------+----------+ - +$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --estimate --in-short 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------+ +| RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 | ++--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ +| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 302.50 MiB | 6.16 GiB | ++--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the @@ -764,12 +774,12 @@ following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| -| host3 (Apple M1 Max) | ENOUGH | 238.08 MiB | | | :thumbsup: | +| host3 (Apple M1 Max) | ENOUGH | 249.27 MiB | | | :thumbsup: | | host4 | 11 GiB | 9.79 GiB | | | :thumbsup: | -| host2 (NVIDIA 4090) | | | 12 GiB | 10.36 GiB | :thumbsup: | +| host2 (NVIDIA 4090) | | | 12 GiB | 10.35 GiB | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 7.01 GiB | :thumbsup: | | host1 (NVIDIA 4080 1) | | | 10 GiB | 8.54 GiB | :thumbsup: | -| host3 (Apple M1 Max) | | | 6 GiB | 36.52 MiB | :thumbsup: | +| host3 (Apple M1 Max) | | | 6 GiB | 302.50 MiB | :thumbsup: | Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`. 
@@ -830,12 +840,12 @@ example and estimate the maximum tokens per second for Apple Silicon M-series us ```shell $ # Estimate full offloaded Q8_0 model -$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --estimate --in-short \ -c 512 \ --device-metric ";,;" $ # Estimate full offloaded Q4_0 model -$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --estimate --in-short \ -c 512 \ --device-metric ";,;" ``` @@ -886,7 +896,7 @@ $ # --device-metric "224GFLOPS;819.2GBps" <-- Apple Mac Studio 0 CPU FLO $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 1 (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 2 (RPC 1) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "27.2TFLOPS;819.2GBps" <-- Apple Mac Studio 0 iGPU FLOPS and VRAM Bandwidth -$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --estimate --in-short \ --no-mmap \ -c 512 \ --rpc host1:port,host2:port \ @@ -923,7 +933,7 @@ $ # --device-metric "510.4GFLOPS;96GBps" <-- Intel i5-14600k CPU FLOPS $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio (M2) (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;64GBps" <-- NVIDIA GeForce RTX 0 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 5.0 x16 Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;8GBps" <-- NVIDIA GeForce RTX 1 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 4.0 x4 Bandwidth -$ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --estimate --in-short \ --no-mmap \ -c 8192 \ --rpc host:port \ @@ -946,23 +956,22 @@ $ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-ins #### Full Layers Offload (default) ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --in-short -+--------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+---------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+---------+------------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 0 | 652.53 MiB | 802.53 MiB | 126 + 1 | 126 GiB | 246.59 GiB | -+--------------------+------------+------------+----------------+---------+------------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" 
--hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --estimate --in-short ++-------------------------------------------------------------------------------------+ +| ESTIMATE | ++------------------------------------------+------------------------------------------+ +| RAM | VRAM 0 | ++--------------------+----------+----------+----------------+------------+------------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+----------+----------+----------------+------------+------------+ +| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | ++--------------------+----------+----------+----------------+------------+------------+ ``` #### Zero Layers Offload ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers=0 --in-short +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=0 --estimate --in-short +------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+-------------------------------------+ @@ -970,25 +979,23 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 +--------------------+------------+------------+----------------+--------+-----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------+-----------+ -| 1 + 126 + 1 | 126.37 GiB | 126.52 GiB | 0 + 0 | 0 B | 32.34 GiB | +| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 33.62 GiB | +--------------------+------------+------------+----------------+--------+-----------+ - ``` #### Specific Layers Offload ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers=10 --in-short -+------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+-------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+--------+-----------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+--------+-----------+ -| 1 + 116 + 1 | 116.64 GiB | 116.78 GiB | 10 + 0 | 10 GiB | 50.39 GiB | -+--------------------+------------+------------+----------------+--------+-----------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=10 --estimate --in-short ++----------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+-----------------------------------+ +| RAM | VRAM 0 | ++--------------------+------------+------------+----------------+--------+---------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+--------+---------+ +| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB | ++--------------------+------------+------------+----------------+--------+---------+ ``` #### Specific Context Size @@ -998,7 +1005,7 @@ By default, the context size retrieved from the model's metadata. 
Use `--ctx-size` to specify the context size. ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=4096 --in-short +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --ctx-size=4096 --estimate --in-short +--------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+---------------------------------------+ @@ -1008,7 +1015,6 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 +--------------------+------------+------------+----------------+----------+-----------+ | 1 + 0 + 0 | 404.53 MiB | 554.53 MiB | 126 + 1 | 3.94 GiB | 93.28 GiB | +--------------------+------------+------------+----------------+----------+-----------+ - ``` #### Enable Flash Attention @@ -1023,17 +1029,16 @@ Please note that not all models support Flash Attention, if the model does not s Disabled" even if you enable it. ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --flash-attention --in-short -+--------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+---------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+---------+------------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 0 | 620.53 MiB | 770.53 MiB | 126 + 1 | 126 GiB | 215.70 GiB | -+--------------------+------------+------------+----------------+---------+------------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --flash-attention --estimate --in-short ++-------------------------------------------------------------------------------------+ +| ESTIMATE | ++------------------------------------------+------------------------------------------+ +| RAM | VRAM 0 | ++--------------------+----------+----------+----------------+------------+------------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+----------+----------+----------------+------------+------------+ +| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 215.98 GiB | ++--------------------+----------+----------+----------------+------------+------------+ ``` #### Disable MMap @@ -1049,7 +1054,7 @@ Please note that some models require loading the whole weight into memory, if th LOAD" shows "Not Supported". 
```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --no-mmap --in-short +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --no-mmap --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +------------------------------------------+------------------------------------------+ @@ -1057,9 +1062,8 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 +--------------------+----------+----------+----------------+------------+------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+----------+----------+----------------+------------+------------+ -| 1 + 0 + 0 | 1.98 GiB | 2.13 GiB | 126 + 1 | 213.97 GiB | 246.59 GiB | +| 1 + 0 + 0 | 2.97 GiB | 3.12 GiB | 126 + 1 | 214.24 GiB | 246.86 GiB | +--------------------+----------+----------+----------------+------------+------------+ - ``` #### With Adapter @@ -1067,19 +1071,19 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 Use `--lora`/`--control-vector` to estimate the usage when loading a model with adapters. ```shell -$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --in-short -+-----------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+--------+----------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+--------+----------+ -| 1 + 0 + 0 | 163.62 MiB | 313.62 MiB | 32 + 1 | 1 GiB | 6.79 GiB | -+--------------------+------------+------------+----------------+--------+----------+ +$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --estimate --in-short ++-------------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+--------------------------------------+ +| RAM | VRAM 0 | ++--------------------+------------+------------+----------------+----------+----------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+----------+----------+ +| 1 + 0 + 0 | 210.80 MiB | 360.80 MiB | 32 + 1 | 1.25 GiB | 7.04 GiB | ++--------------------+------------+------------+----------------+----------+----------+ $ # With a LoRA adapter. 
-$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --skip-metadata --skip-architecture --skip-tokenizer --in-short +$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+ @@ -1087,9 +1091,8 @@ $ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file=" +--------------------+------------+------------+----------------+----------+----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+----------+----------+ -| 1 + 0 + 0 | 168.64 MiB | 318.64 MiB | 32 + 1 | 1.16 GiB | 6.94 GiB | +| 1 + 0 + 0 | 223.91 MiB | 373.91 MiB | 32 + 1 | 1.42 GiB | 7.20 GiB | +--------------------+------------+------------+----------------+----------+----------+ - ``` #### Get Proper Offload Layers @@ -1097,61 +1100,60 @@ $ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file=" Use `--gpu-layers-step` to get the proper offload layers number when the model is too large to fit into the GPUs memory. ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers-step=6 --in-short -+--------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+---------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+---------+------------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 126 + 1 | 126.59 GiB | 126.73 GiB | 0 + 0 | 0 B | 250 MiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 120 + 1 | 120.64 GiB | 120.78 GiB | 6 + 0 | 6 GiB | 43.68 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 114 + 1 | 114.64 GiB | 114.78 GiB | 12 + 0 | 12 GiB | 53.74 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 108 + 1 | 108.64 GiB | 108.78 GiB | 18 + 0 | 18 GiB | 63.80 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 102 + 1 | 102.64 GiB | 102.78 GiB | 24 + 0 | 24 GiB | 73.86 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 96 + 1 | 96.64 GiB | 96.78 GiB | 30 + 0 | 30 GiB | 83.93 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 90 + 1 | 90.64 GiB | 90.78 GiB | 36 + 0 | 36 GiB | 93.99 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 84 + 1 | 84.64 GiB | 84.78 GiB | 42 + 0 | 42 GiB | 104.05 GiB | 
-+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 78 + 1 | 78.64 GiB | 78.78 GiB | 48 + 0 | 48 GiB | 114.11 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 72 + 1 | 72.64 GiB | 72.78 GiB | 54 + 0 | 54 GiB | 124.17 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 66 + 1 | 66.64 GiB | 66.78 GiB | 60 + 0 | 60 GiB | 134.23 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 60 + 1 | 60.64 GiB | 60.78 GiB | 66 + 0 | 66 GiB | 144.29 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 54 + 1 | 54.64 GiB | 54.78 GiB | 72 + 0 | 72 GiB | 154.35 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 48 + 1 | 48.64 GiB | 48.78 GiB | 78 + 0 | 78 GiB | 164.42 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 42 + 1 | 42.64 GiB | 42.78 GiB | 84 + 0 | 84 GiB | 174.48 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 36 + 1 | 36.64 GiB | 36.78 GiB | 90 + 0 | 90 GiB | 184.54 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 30 + 1 | 30.64 GiB | 30.78 GiB | 96 + 0 | 96 GiB | 194.60 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 24 + 1 | 24.64 GiB | 24.78 GiB | 102 + 0 | 102 GiB | 204.66 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 18 + 1 | 18.64 GiB | 18.78 GiB | 108 + 0 | 108 GiB | 214.72 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 12 + 1 | 12.64 GiB | 12.78 GiB | 114 + 0 | 114 GiB | 225.05 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 6 + 1 | 6.64 GiB | 6.78 GiB | 120 + 0 | 120 GiB | 235.64 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 1 | 653.08 MiB | 803.08 MiB | 126 + 0 | 126 GiB | 246.24 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 0 | 652.53 MiB | 802.53 MiB | 126 + 1 | 126 GiB | 246.59 GiB | -+--------------------+------------+------------+----------------+---------+------------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers-step=6 --estimate --in-short ++-----------------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+------------------------------------------+ +| RAM | VRAM 0 | ++--------------------+------------+------------+----------------+------------+------------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 120 + 1 | 121.90 GiB | 122.05 GiB | 6 + 0 | 6 GiB | 44.68 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 114 + 1 | 115.90 GiB | 116.05 GiB | 12 + 0 | 12 GiB | 54.74 GiB | 
++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 108 + 1 | 109.90 GiB | 110.05 GiB | 18 + 0 | 18 GiB | 64.80 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 102 + 1 | 103.90 GiB | 104.05 GiB | 24 + 0 | 24 GiB | 74.86 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 96 + 1 | 97.90 GiB | 98.05 GiB | 30 + 0 | 30 GiB | 84.93 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 90 + 1 | 91.90 GiB | 92.05 GiB | 36 + 0 | 36 GiB | 94.99 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 84 + 1 | 85.90 GiB | 86.05 GiB | 42 + 0 | 42 GiB | 105.05 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 78 + 1 | 79.90 GiB | 80.05 GiB | 48 + 0 | 48 GiB | 115.11 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 72 + 1 | 73.90 GiB | 74.05 GiB | 54 + 0 | 54 GiB | 125.17 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 66 + 1 | 67.90 GiB | 68.05 GiB | 60 + 0 | 60 GiB | 135.23 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 60 + 1 | 61.90 GiB | 62.05 GiB | 66 + 0 | 66 GiB | 145.29 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 54 + 1 | 55.90 GiB | 56.05 GiB | 72 + 0 | 72 GiB | 155.35 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 48 + 1 | 49.90 GiB | 50.05 GiB | 78 + 0 | 78 GiB | 165.42 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 42 + 1 | 43.90 GiB | 44.05 GiB | 84 + 0 | 84 GiB | 175.48 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 36 + 1 | 37.90 GiB | 38.05 GiB | 90 + 0 | 90 GiB | 185.54 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 30 + 1 | 31.90 GiB | 32.05 GiB | 96 + 0 | 96 GiB | 195.60 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 24 + 1 | 25.90 GiB | 26.05 GiB | 102 + 0 | 102 GiB | 205.66 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 18 + 1 | 19.90 GiB | 20.05 GiB | 108 + 0 | 108 GiB | 215.72 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 12 + 1 | 13.90 GiB | 14.05 GiB | 114 + 0 | 114 GiB | 226.05 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 6 + 1 | 7.90 GiB | 8.05 GiB | 120 + 0 | 120 GiB | 236.64 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 0 + 1 | 1.90 GiB | 2.05 GiB | 126 + 0 | 126 GiB | 246.24 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | ++--------------------+------------+------------+----------------+------------+------------+ ``` ## License diff --git a/vendor/github.com/gpustack/gguf-parser-go/cache.go b/vendor/github.com/gpustack/gguf-parser-go/cache.go index 33fd753c..cb8311ad 100644 --- 
a/vendor/github.com/gpustack/gguf-parser-go/cache.go +++ b/vendor/github.com/gpustack/gguf-parser-go/cache.go @@ -55,7 +55,8 @@ func (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, error) { return nil, fmt.Errorf("GGUF file cache get: %w", err) } } - if len(gf.Header.MetadataKV) == 0 || len(gf.TensorInfos) == 0 { + + if len(gf.TensorInfos) == 0 { _ = os.Remove(p) return nil, ErrGGUFFileCacheCorrupted } diff --git a/vendor/github.com/gpustack/gguf-parser-go/file.go b/vendor/github.com/gpustack/gguf-parser-go/file.go index 0c6a2e3a..ae6a2b08 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file.go @@ -419,6 +419,10 @@ func parseGGUFFile(fs []_GGUFFileReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, e // Types for GGUF hierarchical tensors. type ( + // GGUFTensorInfoFilter is a filter to filter out if the given tensor name matches. + // Return true if the name matches, and false otherwise. + GGUFTensorInfoFilter func(name string) bool + // IGGUFTensorInfos is an interface for GGUF tensor infos, // which includes basic operations. IGGUFTensorInfos interface { @@ -435,9 +439,9 @@ type ( // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) // Elements returns the number of elements(parameters). - Elements() uint64 + Elements(filter ...GGUFTensorInfoFilter) uint64 // Bytes returns the number of bytes. - Bytes() uint64 + Bytes(filter ...GGUFTensorInfoFilter) uint64 // Count returns the number of tensors. Count() uint64 } @@ -877,11 +881,17 @@ func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, // Elements returns the number of elements of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2597-L2601. -func (ti GGUFTensorInfo) Elements() uint64 { +func (ti GGUFTensorInfo) Elements(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } + for i := range filter { + if filter[i] != nil && !filter[i](ti.Name) { + return 0 + } + } + ret := uint64(1) for i := uint32(0); i < ti.NDimensions; i++ { ret *= ti.Dimensions[i] @@ -892,7 +902,7 @@ func (ti GGUFTensorInfo) Elements() uint64 { // Bytes returns the number of bytes of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2609-L2626. -func (ti GGUFTensorInfo) Bytes() uint64 { +func (ti GGUFTensorInfo) Bytes(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } @@ -902,6 +912,12 @@ func (ti GGUFTensorInfo) Bytes() uint64 { panic(fmt.Errorf("invalid type: %v", ti.Type)) } + for i := range filter { + if filter[i] != nil && !filter[i](ti.Name) { + return 0 + } + } + // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L3210-L3214 nb := make([]uint64, 0, ti.NDimensions) { @@ -1061,7 +1077,7 @@ func (tis GGUFTensorInfos) layers() GGUFLayerTensorInfos { } l := pm[p].(*GGUFNamedTensorInfos) l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) - case (ps[0] == "v" || ps[0] == "t") && ps[1] == "blk": + case (ps[0] == "v" || ps[0] == "t" || ps[0] == "a") && ps[1] == "blk": // LLaMACpp CLIP. p := ps[0] if _, ok := pm[p]; !ok { @@ -1282,19 +1298,19 @@ func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTen } // Elements returns the number of elements of the GGUFLayerTensorInfos. 
-func (ltis GGUFLayerTensorInfos) Elements() uint64 { +func (ltis GGUFLayerTensorInfos) Elements(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { - ret += ltis[i].Elements() + ret += ltis[i].Elements(filter...) } return ret } // Bytes returns the number of bytes of the GGUFLayerTensorInfos. -func (ltis GGUFLayerTensorInfos) Bytes() uint64 { +func (ltis GGUFLayerTensorInfos) Bytes(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { - ret += ltis[i].Bytes() + ret += ltis[i].Bytes(filter...) } return ret } @@ -1697,7 +1713,7 @@ func (rd _GGUFTensorInfoReader) Read() (ti GGUFTensorInfo, err error) { } ti.Type = GGMLType(v) if ti.Type >= _GGMLTypeCount { - return ti, fmt.Errorf("invalid type: %v", ti.Type) + return ti, fmt.Errorf("%v: This quantized type is currently unsupported", ti.Type) } } diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go b/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go index f545d0ee..ad2f1ca9 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go @@ -2,6 +2,7 @@ package gguf_parser import ( "regexp" + "slices" "strings" ) @@ -41,6 +42,8 @@ type ( ExpertCount uint32 `json:"expertCount,omitempty"` // ExpertUsedCount(n_expert_used) is the number of experts used during each token evaluation in MoE models. ExpertUsedCount uint32 `json:"expertUsedCount,omitempty"` + // ExpertSharedCount(n_expert_shared) is the number of shared experts in MoE models. + ExpertSharedCount uint32 `json:"expertSharedCount,omitempty"` // AttentionHeadCount(n_head) is the number of attention heads. AttentionHeadCount uint64 `json:"attentionHeadCount,omitempty"` // AttentionHeadCountKV(n_head_kv) is the number of attention heads per group used in Grouped-Query-Attention. @@ -48,6 +51,14 @@ type ( // If not provided or equal to AttentionHeadCount, // the model does not use Grouped-Query-Attention. AttentionHeadCountKV uint64 `json:"attentionHeadCountKV,omitempty"` + // AttentionSlidingWindowPattern is the pattern used in the sliding window attention. + // + // 0 means all layers are Sliding Window Attention. + // 1 means all layers are none Sliding Window Attention. + // N means every Nth layer is none Sliding Window Attention. + AttentionSlidingWindowPattern uint32 `json:"attentionSlidingWindowPattern,omitempty"` + // AttentionSlidingWindow is the size of the sliding window used in the attention layer. + AttentionSlidingWindow uint64 `json:"attentionSlidingWindow,omitempty"` // AttentionMaxALiBIBias is the maximum bias to use for ALiBI. AttentionMaxALiBIBias float32 `json:"attentionMaxALiBIBias,omitempty"` // AttentionClampKQV describes a value `C`, @@ -58,20 +69,46 @@ type ( // AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(root Mean Square Layer Normalization), // which is a simplification of the original LayerNorm. AttentionLayerNormRMSEpsilon float32 `json:"attentionLayerNormRMSEpsilon,omitempty"` + // AttentionQueryLORARank is the LORA rank of the query matrix. + // + // Zero means no LORA. + AttentionQueryLORARank uint32 `json:"attentionQueryLORARank,omitempty"` + // AttentionKeyValueLORARank is the LORA rank of the key/value matrix. + // + // Zero means no LORA. + AttentionKeyValueLORARank uint32 `json:"attentionKeyValueLORARank,omitempty"` // AttentionKeyLength(n_embd_head_k) is the size of a key head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. 
AttentionKeyLength uint32 `json:"attentionKeyLength,omitempty"` + // AttentionKeyLengthMLA(n_embd_head_k_mla) is the size of a key head in MLA(Multi-Layer Attention). + // + // Zero means no MLA. + AttentionKeyLengthMLA uint32 `json:"attentionKeyLengthMLA,omitempty"` // AttentionValueLength(n_embd_head_v) is the size of a value head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. AttentionValueLength uint32 `json:"attentionValueLength,omitempty"` + // AttentionValueLengthMLA(n_embd_head_v_mla) is the size of a value head in MLA(Multi-Layer Attention). + // + // Zero means no MLA. + AttentionValueLengthMLA uint32 `json:"attentionValueLengthMLA,omitempty"` // AttentionCausal is true if the attention is causal. AttentionCausal bool `json:"attentionCausal,omitempty"` + // AttentionRecurrent is true if the attention is recurrent. + // + // Used in Mamba, RWKV, and similar architectures. + AttentionRecurrent bool `json:"attentionRecurrent,omitempty"` + // AttentionHybrid is true if the attention is hybrid (causal (self-attention) + recurrent). + // + // Used in Jamba, Falcon-H1, and similar architectures. + AttentionHybrid bool `json:"attentionHybrid,omitempty"` // RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding). RoPEDimensionCount uint64 `json:"ropeDimensionCount,omitempty"` // RoPEFrequencyBase is the base frequency of the RoPE. RoPEFrequencyBase float32 `json:"ropeFrequencyBase,omitempty"` + // RoPEFrequencyScale is the scale frequency of the RoPE. + RoPEFrequencyScale float32 `json:"ropeFrequencyScale,omitempty"` // RoPEFrequencyScale is the frequency scale of the RoPE. RoPEScalingType string `json:"ropeScalingType,omitempty"` // RoPEScalingFactor is the scaling factor of the RoPE. @@ -80,14 +117,26 @@ type ( RoPEScalingOriginalContextLength uint64 `json:"ropeScalingOriginalContextLength,omitempty"` // RoPEScalingFinetuned is true if the RoPE scaling is fine-tuned. RoPEScalingFinetuned bool `json:"ropeScalingFinetuned,omitempty"` - // SSMConvolutionKernel is the size of the convolution kernel used in the SSM(Selective State Space Model). + // SSMConvolutionKernel is the size of the convolution kernel used in the Selective State Space Model (SSM) and similar architectures. SSMConvolutionKernel uint32 `json:"ssmConvolutionKernel,omitempty"` - // SSMInnerSize is the embedding size of the state in SSM. + // SSMInnerSize is the embedding size of the state in SSM and similar architectures. SSMInnerSize uint32 `json:"ssmInnerSize,omitempty"` - // SSMStateSize is the size of the recurrent state in SSM. + // SSMStateSize is the size of the recurrent state in SSM and similar architectures. SSMStateSize uint32 `json:"ssmStateSize,omitempty"` - // SSMTimeStepRank is the rank of the time steps in SSM. + // SSMTimeStepRank is the rank of the time steps in SSM and similar architectures. SSMTimeStepRank uint32 `json:"ssmTimeStepRank,omitempty"` + // SSMGroupCount is the number of groups in the SSM and similar architectures. + SSMGroupCount uint32 `json:"ssmGroupCount,omitempty"` + // WKVHeadSize is the size of the head in RWKV and similar architectures. + RWKVHeadSize uint32 `json:"rwkvHeadSize,omitempty"` + // RWKVRescaleEveryNLayers is the number of layers after which the rescaling is applied in RWKV and similar architectures. + RWKVRescaleEveryNLayers uint32 `json:"rwkvRescaleEveryNLayers,omitempty"` + // RWKVTimeMixExtraDimension indicates whether the RWKV architecture has an extra dimension for time mixing. 
+ RWKVTimeMixExtraDimension uint32 `json:"rwkvTimeMixExtraDimension,omitempty"` + // RWKVTimeDecayExtraDimension indicates whether the RWKV architecture has an extra dimension for time decay. + RWKVTimeDecayExtraDimension uint32 `json:"rwkvTimeDecayExtraDimension,omitempty"` + // TokenShiftCount is the number of token shifts used in RWKV and similar architectures. + RWKVTokenShiftCount uint32 `json:"rwkvTokenShiftCount,omitempty"` // VocabularyLength is the size of the vocabulary. // // VocabularyLength is the same as the tokenizer's token size. @@ -95,13 +144,6 @@ type ( /* Appendix */ - // EmbeddingGGQA is the GQA of the embedding layer. - EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"` - // EmbeddingKeyGQA is the number of key GQA in the embedding layer. - EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"` - // EmbeddingValueGQA is the number of value GQA in the embedding layer. - EmbeddingValueGQA uint64 `json:"embeddingValueGQA,omitempty"` - // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". @@ -109,47 +151,123 @@ type ( // ClipHasLLaVAProjector indicates whether the clip model has LLaVA projector or not. // // Only used when Architecture is "clip". + // + // Deprecated: use ClipProjectorType instead. ClipHasLLaVAProjector bool `json:"clipHasLLaVAProjector,omitempty"` // ClipHasMiniCPMVProjector indicates whether the clip model has MiniCPMV projector or not. // // Only used when Architecture is "clip". + // + // Deprecated: use ClipProjectorType instead. ClipHasMiniCPMVProjector bool `json:"clipHasMiniCPMVProject,omitempty"` // ClipMiniCPMVVersion is the version of the MiniCPMV projector. // - // Only used when Architecture is "clip" and ClipHasMiniCPMVProjector is true. + // Only used when Architecture is "clip". ClipMiniCPMVVersion int32 `json:"clipMiniCPMVVersion,omitempty"` + // ClipMiniCPMVQueryNum is the number of queries used in the MiniCPMV projector. + // + // Only used when Architecture is "clip". + ClipMiniCPMVQueryNum int32 `json:"clipMiniCPMVQueryNum,omitempty"` // ClipHasGLMProjector indicates whether the clip model has GLM projector or not. // // Only used when Architecture is "clip". + // + // Deprecated: use ClipProjectorType instead. ClipHasGLMProjector bool `json:"clipHasGLMProjector,omitempty"` // ClipHasQwen2VLMerger indicates whether the clip model has Qwen2VL merger or not. // // Only used when Architecture is "clip". - ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"` - // ClipHasTextEncoder indicates whether the clip model has text encoder or not. // - // Only used when Architecture is "clip". - ClipHasTextEncoder bool `json:"clipHasTextEncoder,omitempty"` + // Deprecated: use ClipProjectorType instead. + ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"` // ClipHasVisionEncoder indicates whether the clip model has vision encoder or not. // // Only used when Architecture is "clip". ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"` - // ClipVisionImageSize indicates the image size of vision encoder. + // ClipVisionEmbeddingLength indicates the embedding length of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. - ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` - // ClipVisionPatchSize indicates the patch size of vision encoder. 
+ ClipVisionEmbeddingLength uint64 `json:"clipVisionEmbeddingLength,omitempty"` + // ClipVisionBlockCount indicates the number of blocks in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. - ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` + ClipVisionBlockCount uint64 `json:"clipVisionBlockCount,omitempty"` + // ClipVisionFeedForwardLength indicates the feed-forward length of the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionFeedForwardLength []uint64 `json:"clipVisionFeedForwardLength,omitempty"` + // ClipVisionAttentionHeadCount indicates the number of attention heads in the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionAttentionHeadCount uint64 `json:"clipVisionAttentionHeadCount,omitempty"` + // ClipVisionAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionAttentionLayerNormRMSEpsilon float32 `json:"clipVisionAttentionLayerNormRMSEpsilon,omitempty"` // ClipVisionProjectionDim indicates the projection dimension of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectionDim uint32 `json:"clipVisionProjectionDim,omitempty"` + // ClipVisionProjectorScaleFactor is the scale factor of the projector. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionProjectorScaleFactor uint32 `json:"clipVisionProjectorScaleFactor,omitempty"` + // ClipVisionImageSize indicates the image size of vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` + // ClipVisionPatchSize indicates the patch size of vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` // ClipVisionMMPatchMergeType indicates the merge type of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionMMPatchMergeType string `json:"clipVisionMMPatchMergeType,omitempty"` + // ClipVisionSpatialMergeSize is the spatial merge size of the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionSpatialMergeSize uint32 `json:"clipVisionSpatialMergeSize,omitempty"` + // ClipVisionWindowAttentionPattern is the Window Attention pattern used in the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionWindowAttentionPattern uint32 `json:"clipVisionWindowAttentionPattern,omitempty"` + // ClipHasAudioEncoder indicates whether the clip model has audio encoder or not. + // + // Only used when Architecture is "clip". + ClipHasAudioEncoder bool `json:"clipHasAudioEncoder,omitempty"` + // ClipAudioEmbeddingLength indicates the embedding length of audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioEmbeddingLength uint64 `json:"clipAudioEmbeddingLength,omitempty"` + // ClipAudioBlockCount indicates the number of blocks in the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. 
+ ClipAudioBlockCount uint64 `json:"clipAudioBlockCount,omitempty"` + // ClipAudioFeedForwardLength indicates the feed-forward length of the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioFeedForwardLength []uint64 `json:"clipAudioFeedForwardLength,omitempty"` + // ClipAudioAttentionHeadCount indicates the number of attention heads in the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioAttentionHeadCount uint64 `json:"clipAudioAttentionHeadCount,omitempty"` + // ClipAudioAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioAttentionLayerNormRMSEpsilon float32 `json:"clipAudioAttentionLayerNormRMSEpsilon,omitempty"` + // ClipAudioProjectionDim indicates the projection dimension of audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioProjectionDim uint32 `json:"clipAudioProjectionDim,omitempty"` + // ClipAudioProjectorStackFactor is the scale factor of the projector. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioProjectorStackFactor uint32 `json:"clipAudioProjectorStackFactor,omitempty"` + // ClipAudioNumMelBins is the number of mel bins used in the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioNumMelBins uint32 `json:"clipAudioNumMelBins,omitempty"` // AdapterType is the type of the adapter. // @@ -236,11 +354,11 @@ func (gaa GGUFArchitectureDiffusionAutoencoder) String() string { // Architecture returns the architecture metadata of the GGUF file. 
func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { - if gf.TensorInfos.Match(regexp.MustCompile(`^model\.diffusion_model\..*`)) || - gf.TensorInfos.Match(regexp.MustCompile(`^double_blocks\..*`)) { - return gf.diffuserArchitecture() + for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { + if gf.TensorInfos.Match(re) { + return gf.diffuserArchitecture() + } } - var ( generalTypeKey = "general.type" generalArchitectureKey = "general.architecture" @@ -274,6 +392,8 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { return gf.adapterArchitecture(arch) case typ == "adapter": return gf.adapterArchitecture(arch) + case typ == "imatrix": + return gf.imatrixArchitecture(arch) } return gf.transformerArchitecture(arch) } @@ -282,11 +402,16 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { const ( // Diffusion - sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x - sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL - sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner - sd3Key = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x - sdInPaintFeatureKey = "model.diffusion_model.input_blocks.0.0.weight" // SD in-paint feature + sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x + sdKey2 = "output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" + sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL + sdXlKey2 = "output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" + sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner + sdXlRefinerKey2 = "output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" + sd3Key = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x + sd3Key2 = "joint_blocks.23.x_block.attn.proj.weight" + sdInPaintFeatureKey = "model.diffusion_model.input_blocks.0.0.weight" // SD in-paint feature + sdInPaintFeatureKey2 = "input_blocks.0.0.weight" fluxKey = "model.diffusion_model.double_blocks.0.txt_attn.proj.weight" // FLUX.1 fluxKey2 = "double_blocks.0.txt_attn.proj.weight" @@ -295,19 +420,28 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { // Conditioner - openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14 - openClipVitH14Key = "cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14 - openClipVitG14Key = "cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14 - t5xxlKey = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl - t5xxlKey2 = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" + openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14 + openAiClipVitL14Key2 = "text_model.encoder.layers.11.self_attn.k_proj.weight" + openClipVitH14Key = "cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14 + openClipVitH14Key2 = "text_model.encoder.layers.22.self_attn.k_proj.weight" + openClipVitG14Key = 
"cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14 + openClipVitG14Key2 = "text_model.encoder.layers.31.self_attn.k_proj.weight" + t5xxlKey = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl + t5xxlKey2 = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" + t5xxlKey3 = "encoder.block.23.layer.0.SelfAttention.k.weight" ) tis, _ := gf.TensorInfos.Index([]string{ sdKey, + sdKey2, sdXlKey, + sdXlKey2, sdXlRefinerKey, + sdXlRefinerKey2, sd3Key, + sd3Key2, sdInPaintFeatureKey, + sdInPaintFeatureKey2, fluxKey, fluxKey2, @@ -315,10 +449,14 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { fluxFillFeatureKey2, openAiClipVitL14Key, + openAiClipVitL14Key2, openClipVitH14Key, + openClipVitH14Key2, openClipVitG14Key, + openClipVitG14Key2, t5xxlKey, t5xxlKey2, + t5xxlKey3, }) ga.Type = "model" @@ -332,6 +470,14 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } + } else if _, ok := tis[sdKey2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion 1.x" + if ti.Dimensions[0] == 1024 { + ga.DiffusionArchitecture = "Stable Diffusion 2.x" + } + if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { + ga.DiffusionArchitecture += " InPaint" + } } else if _, ok := tis[sdXlKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL" if _, ok = tis[sdXlRefinerKey]; ok { @@ -340,9 +486,20 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } + } else if _, ok := tis[sdXlKey2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion XL" + if _, ok = tis[sdXlRefinerKey2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" + } + if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { + ga.DiffusionArchitecture += " InPaint" + } } else if _, ok := tis[sd3Key]; ok { ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true + } else if _, ok := tis[sd3Key2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion 3.x" + ga.DiffusionTransformer = true } if _, ok := tis[fluxKey]; ok { ga.DiffusionArchitecture = "FLUX.1" @@ -370,12 +527,29 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { } } ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) + } else if ti, ok := tis[openAiClipVitL14Key2]; ok { + cond := GGUFArchitectureDiffusionConditioner{ + Architecture: "OpenAI CLIP ViT-L/14", + FileType: ti.GetFileType(), + } + if ti, ok = tis[openClipVitH14Key2]; ok { + cond = GGUFArchitectureDiffusionConditioner{ + Architecture: "OpenCLIP ViT-H/14", + FileType: ti.GetFileType(), + } + } + ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) } if ti, ok := tis[openClipVitG14Key]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-G/14", FileType: ti.GetFileType(), }) + } else if ti, ok = tis[openClipVitG14Key2]; ok { + ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ + Architecture: "OpenCLIP ViT-G/14", + FileType: ti.GetFileType(), + }) } if ti, ok := tis[t5xxlKey]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ @@ -387,12 +561,23 @@ func (gf 
*GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) + } else if ti, ok = tis[t5xxlKey3]; ok { + ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ + Architecture: "Google T5-xxl", + FileType: ti.GetFileType(), + }) } - if tis := gf.TensorInfos.Search(regexp.MustCompile(`^first_stage_model\..*`)); len(tis) != 0 { - ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{ - Architecture: ga.DiffusionArchitecture + " VAE", - FileType: GGUFTensorInfos(tis).GetFileType(), + for _, re := range []*regexp.Regexp{ + regexp.MustCompile(`^first_stage_model\..*`), + regexp.MustCompile(`^decoder\.conv_in\..*`), + } { + if tis := gf.TensorInfos.Search(re); len(tis) != 0 { + ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{ + Architecture: ga.DiffusionArchitecture + " VAE", + FileType: GGUFTensorInfos(tis).GetFileType(), + } + break } } @@ -401,30 +586,37 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { const ( - projectorTypeKey = "clip.projector_type" - hasLLaVAProjectorKey = "clip.has_llava_projector" - hasMiniCPMVProjector = "clip.has_minicpmv_projector" - miniCPMVVersionKey = "clip.minicpmv_version" - hasGLMProjectorKey = "clip.has_glm_projector" - hasQwen2VLMergerKey = "clip.has_qwen2vl_merger" - hasTextEncoderKey = "clip.has_text_encoder" - hasVisionEncoderKey = "clip.has_vision_encoder" - visionImageSizeKey = "clip.vision.image_size" - visionPatchSizeKey = "clip.vision.patch_size" - visionProjectionDim = "clip.vision.projection_dim" - visionMMPatchMergeType = "clip.vision.mm_patch_merge_type" - - textEmbeddingLengthKey = "clip.text.embedding_length" - textBlockCountKey = "clip.text.block_count" - textFeedForwardLengthKey = "clip.text.feed_forward_length" - textAttentionHeadCountKey = "clip.text.attention.head_count" - textAttentionLayerNormRMSEpsilonKey = "clip.text.attention.layer_norm_epsilon" - + projectorTypeKey = "clip.projector_type" + hasLLaVAProjectorKey = "clip.has_llava_projector" + hasMiniCPMVProjector = "clip.has_minicpmv_projector" + miniCPMVVersionKey = "clip.minicpmv_version" + miniCPMVQueryNumKey = "clip.minicpmv_query_num" + hasGLMProjectorKey = "clip.has_glm_projector" + hasQwen2VLMergerKey = "clip.has_qwen2vl_merger" + + hasVisionEncoderKey = "clip.has_vision_encoder" visionEmbeddingLengthKey = "clip.vision.embedding_length" visionBlockCountKey = "clip.vision.block_count" visionFeedForwardLengthKey = "clip.vision.feed_forward_length" visionAttentionHeadCountKey = "clip.vision.attention.head_count" visionAttentionLayerNormRMSEpsilonKey = "clip.vision.attention.layer_norm_epsilon" + visionProjectionDimKey = "clip.vision.projection_dim" + visionProjectorScaleFactorKey = "clip.vision.projector.scale_factor" + visionImageSizeKey = "clip.vision.image_size" + visionPatchSizeKey = "clip.vision.patch_size" + visionMMPatchMergeTypeKey = "clip.vision.mm_patch_merge_type" + visioSpatialMergeSizeKey = "clip.vision.spatial_merge_size" + visionWindowAttentionPatternKey = "clip.vision.n_wa_pattern" + + hasAudioEncoderKey = "clip.has_audio_encoder" + audioEmbeddingLengthKey = "clip.audio.embedding_length" + audioBlockCountKey = "clip.audio.block_count" + audioFeedForwardLengthKey = "clip.audio.feed_forward_length" + audioAttentionHeadCountKey = "clip.audio.attention.head_count" + audioAttentionLayerNormRMSEpsilonKey = "clip.audio.attention.layer_norm_epsilon" + 
audioProjectionDimKey = "clip.audio.projection_dim" + audioProjectorStackFactorKey = "clip.audio.projector.stack_factor" + audioNumMelBinsKey = "clip.audio.num_mel_bins" ) ga.Type = "projector" @@ -435,24 +627,33 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { hasLLaVAProjectorKey, hasMiniCPMVProjector, miniCPMVVersionKey, + miniCPMVQueryNumKey, hasGLMProjectorKey, hasQwen2VLMergerKey, - hasTextEncoderKey, + // Vision hasVisionEncoderKey, - visionImageSizeKey, - visionPatchSizeKey, - visionProjectionDim, - visionMMPatchMergeType, - textEmbeddingLengthKey, - textBlockCountKey, - textFeedForwardLengthKey, - textAttentionHeadCountKey, - textAttentionLayerNormRMSEpsilonKey, visionEmbeddingLengthKey, visionBlockCountKey, visionFeedForwardLengthKey, visionAttentionHeadCountKey, visionAttentionLayerNormRMSEpsilonKey, + visionProjectionDimKey, + visionProjectorScaleFactorKey, + visionImageSizeKey, + visionPatchSizeKey, + visionMMPatchMergeTypeKey, + visioSpatialMergeSizeKey, + visionWindowAttentionPatternKey, + // Audio + hasAudioEncoderKey, + audioEmbeddingLengthKey, + audioBlockCountKey, + audioFeedForwardLengthKey, + audioAttentionHeadCountKey, + audioAttentionLayerNormRMSEpsilonKey, + audioProjectionDimKey, + audioProjectorStackFactorKey, + audioNumMelBinsKey, }) if v, ok := m[projectorTypeKey]; ok { @@ -469,96 +670,109 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { if v, ok := m[miniCPMVVersionKey]; ok { ga.ClipMiniCPMVVersion = ValueNumeric[int32](v) } + if v, ok := m[miniCPMVQueryNumKey]; ok { + ga.ClipMiniCPMVQueryNum = ValueNumeric[int32](v) + } if v, ok := m[hasGLMProjectorKey]; ok { ga.ClipHasGLMProjector = v.ValueBool() } if v, ok := m[hasQwen2VLMergerKey]; ok { ga.ClipHasQwen2VLMerger = v.ValueBool() } - if v, ok := m[hasTextEncoderKey]; ok { - ga.ClipHasTextEncoder = v.ValueBool() - } + // Vision if v, ok := m[hasVisionEncoderKey]; ok { ga.ClipHasVisionEncoder = v.ValueBool() } + if v, ok := m[visionEmbeddingLengthKey]; ok { + ga.ClipVisionEmbeddingLength = ValueNumeric[uint64](v) + } + if v, ok := m[visionBlockCountKey]; ok { + ga.ClipVisionBlockCount = ValueNumeric[uint64](v) + } + if v, ok := m[visionFeedForwardLengthKey]; ok { + if v.ValueType == GGUFMetadataValueTypeArray { + ga.ClipVisionFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) + } else { + vx := ValueNumeric[uint64](v) + ga.ClipVisionFeedForwardLength = make([]uint64, ga.ClipVisionBlockCount) + for i := range ga.ClipVisionFeedForwardLength { + ga.ClipVisionFeedForwardLength[i] = vx + } + } + } + if v, ok := m[visionAttentionHeadCountKey]; ok { + ga.ClipVisionAttentionHeadCount = ValueNumeric[uint64](v) + } + if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { + ga.ClipVisionAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + } if v, ok := m[visionImageSizeKey]; ok { ga.ClipVisionImageSize = ValueNumeric[uint32](v) } + if v, ok := m[visionProjectionDimKey]; ok { + ga.ClipVisionProjectionDim = ValueNumeric[uint32](v) + } + ga.ClipVisionProjectorScaleFactor = 1 + if ga.ClipProjectorType == "gemma3" { + ga.ClipVisionProjectorScaleFactor = 4 + } + if v, ok := m[visionProjectorScaleFactorKey]; ok { + ga.ClipVisionProjectorScaleFactor = ValueNumeric[uint32](v) + } + ga.ClipVisionPatchSize = 1 if v, ok := m[visionPatchSizeKey]; ok { ga.ClipVisionPatchSize = ValueNumeric[uint32](v) } - if v, ok := m[visionProjectionDim]; ok { - ga.ClipVisionProjectionDim = ValueNumeric[uint32](v) - } ga.ClipVisionMMPatchMergeType = "flat" - if v, ok := m[visionMMPatchMergeType]; ok { + 
if v, ok := m[visionMMPatchMergeTypeKey]; ok { ga.ClipVisionMMPatchMergeType = v.ValueString() } - - if v, ok := m[textEmbeddingLengthKey]; ok { - ga.EmbeddingLength = ValueNumeric[uint64](v) + if v, ok := m[visioSpatialMergeSizeKey]; ok { + ga.ClipVisionSpatialMergeSize = ValueNumeric[uint32](v) } - if v, ok := m[textBlockCountKey]; ok { - ga.BlockCount = ValueNumeric[uint64](v) + if v, ok := m[visionWindowAttentionPatternKey]; ok { + ga.ClipVisionWindowAttentionPattern = ValueNumeric[uint32](v) } - if v, ok := m[textFeedForwardLengthKey]; ok { - if v.ValueType == GGUFMetadataValueTypeArray { - ga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) - } else { - vx := ValueNumeric[uint64](v) - ga.FeedForwardLength = make([]uint64, ga.BlockCount) - for i := range ga.FeedForwardLength { - ga.FeedForwardLength[i] = vx - } - } + // Audio + if v, ok := m[hasAudioEncoderKey]; ok { + ga.ClipHasAudioEncoder = v.ValueBool() } - if v, ok := m[textAttentionHeadCountKey]; ok { - ga.AttentionHeadCount = ValueNumeric[uint64](v) + if v, ok := m[audioEmbeddingLengthKey]; ok { + ga.ClipAudioEmbeddingLength = ValueNumeric[uint64](v) } - if v, ok := m[textAttentionLayerNormRMSEpsilonKey]; ok { - ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) - } - - if v, ok := m[visionEmbeddingLengthKey]; ok { - ga.EmbeddingLength = ValueNumeric[uint64](v) + if v, ok := m[audioBlockCountKey]; ok { + ga.ClipAudioBlockCount = ValueNumeric[uint64](v) } - if v, ok := m[visionBlockCountKey]; ok { - ga.BlockCount = ValueNumeric[uint64](v) - } - if v, ok := m[visionFeedForwardLengthKey]; ok { + if v, ok := m[audioFeedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { - ga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) + ga.ClipAudioFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) - ga.FeedForwardLength = make([]uint64, ga.BlockCount) - for i := range ga.FeedForwardLength { - ga.FeedForwardLength[i] = vx + ga.ClipAudioFeedForwardLength = make([]uint64, ga.ClipAudioBlockCount) + for i := range ga.ClipAudioFeedForwardLength { + ga.ClipAudioFeedForwardLength[i] = vx } } } - if v, ok := m[visionAttentionHeadCountKey]; ok { - ga.AttentionHeadCount = ValueNumeric[uint64](v) + if v, ok := m[audioAttentionHeadCountKey]; ok { + ga.ClipAudioAttentionHeadCount = ValueNumeric[uint64](v) } - if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { - ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + if v, ok := m[audioAttentionLayerNormRMSEpsilonKey]; ok { + ga.ClipAudioAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + } + if v, ok := m[audioProjectionDimKey]; ok { + ga.ClipAudioProjectionDim = ValueNumeric[uint32](v) + } + ga.ClipAudioProjectorStackFactor = 1 + if v, ok := m[audioProjectorStackFactorKey]; ok { + ga.ClipAudioProjectorStackFactor = ValueNumeric[uint32](v) + } + if v, ok := m[audioNumMelBinsKey]; ok { + ga.ClipAudioNumMelBins = ValueNumeric[uint32](v) } ga.AttentionHeadCountKV = ga.AttentionHeadCount - { - if ga.AttentionHeadCountKV > 0 { - ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV - } - if ga.AttentionHeadCount > 0 { - ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV - ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV - } - if ga.Architecture == "mamba" { - ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize) - ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize) - } - } - return ga } @@ -597,6 
+811,13 @@ func (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitecture) { return ga } +func (gf *GGUFFile) imatrixArchitecture(_ string) (ga GGUFArchitecture) { + ga.Type = "imatrix" + ga.Architecture = "imatrix" + + return ga +} + func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { var ( contextLengthKey = arch + ".context_length" @@ -608,21 +829,28 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { expertSharedFeedForwardLengthKey = arch + ".expert_shared_feed_forward_length" expertCountKey = arch + ".expert_count" expertUsedCountKey = arch + ".expert_used_count" + expertSharedCountKey = arch + ".expert_shared_count" attentionHeadCountKey = arch + ".attention.head_count" attentionHeadCountKVKey = arch + ".attention.head_count_kv" + attentionSlidingWindowKey = arch + ".attention.sliding_window" attentionMaxALiBIBiasKey = arch + ".attention.max_alibi_bias" attentionMaxALiBIBiasKey2 = arch + ".attention.alibi_bias_max" attentionClampKQVKey = arch + ".attention.clamp_kqv" attentionClampKQVKey2 = arch + ".attention.clip_kqv" attentionLayerNormEpsilonKey = arch + ".attention.layer_norm_epsilon" attentionLayerNormRMSEpsilonKey = arch + ".attention.layer_norm_rms_epsilon" + attentionQueryLORARankKey = arch + ".attention.q_lora_rank" + attentionKeyValueLORARankKey = arch + ".attention.kv_lora_rank" attentionKeyLengthKey = arch + ".attention.key_length" + attentionKeyLengthMLAKey = arch + ".attention.key_length_mla" attentionValueLengthKey = arch + ".attention.value_length" + attentionValueLengthMLAKey = arch + ".attention.value_length_mla" attentionCausalKey = arch + ".attention.causal" ropeDimensionCountKey = arch + ".rope.dimension_count" ropeFrequencyBaseKey = arch + ".rope.freq_base" + ropeFrequencyScaleKey = arch + ".rope.freq_scale" ropeScaleLinearKey = arch + ".rope.scale_linear" ropeScalingTypeKey = arch + ".rope.scaling.type" ropeScalingFactorKey = arch + ".rope.scaling.factor" @@ -633,6 +861,13 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { ssmInnerSizeKey = arch + ".ssm.inner_size" ssmStateSizeKey = arch + ".ssm.state_size" ssmTimeStepRankKey = arch + ".ssm.time_step_rank" + ssmGroupCountKey = arch + ".ssm.group_count" + + rwkvHeadSizeKey = arch + ".wkv.head_size" + rwkvRescaleEveryNLayersKey = arch + ".rescale_every_n_layers" + rwkvTimeMixExtraDimensionKey = arch + ".time_mix_extra_dim" + rwkvTimeDecayExtraDimensionKey = arch + ".time_decay_extra_dim" + rwkvTokenShiftCountKey = arch + ".token_shift_count" vocabularyLengthKey = arch + ".vocab_size" tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" @@ -650,19 +885,26 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { expertSharedFeedForwardLengthKey, expertCountKey, expertUsedCountKey, + expertSharedCountKey, attentionHeadCountKey, attentionHeadCountKVKey, + attentionSlidingWindowKey, attentionMaxALiBIBiasKey, attentionMaxALiBIBiasKey2, attentionClampKQVKey, attentionClampKQVKey2, attentionLayerNormEpsilonKey, attentionLayerNormRMSEpsilonKey, + attentionQueryLORARankKey, + attentionKeyValueLORARankKey, attentionKeyLengthKey, + attentionKeyLengthMLAKey, attentionValueLengthKey, + attentionValueLengthMLAKey, attentionCausalKey, ropeDimensionCountKey, ropeFrequencyBaseKey, + ropeFrequencyScaleKey, ropeScaleLinearKey, ropeScalingTypeKey, ropeScalingFactorKey, @@ -672,6 +914,12 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { ssmInnerSizeKey, ssmStateSizeKey, 
ssmTimeStepRankKey, + ssmGroupCountKey, + rwkvHeadSizeKey, + rwkvRescaleEveryNLayersKey, + rwkvTimeMixExtraDimensionKey, + rwkvTimeDecayExtraDimensionKey, + rwkvTokenShiftCountKey, vocabularyLengthKey, tokenizerGGMLTokensKey, }) @@ -703,6 +951,9 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { if v, ok := m[expertUsedCountKey]; ok { ga.ExpertUsedCount = ValueNumeric[uint32](v) } + if v, ok := m[expertSharedCountKey]; ok { + ga.ExpertSharedCount = ValueNumeric[uint32](v) + } if v, ok := m[expertFeedForwardLengthKey]; ok { ga.ExpertFeedForwardLength = ValueNumeric[uint64](v) } @@ -726,6 +977,33 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { } else { ga.AttentionHeadCountKV = ga.AttentionHeadCount } + ga.AttentionSlidingWindowPattern = 1 + if v, ok := m[attentionSlidingWindowKey]; ok { + if v.ValueType == GGUFMetadataValueTypeArray { + ga.AttentionSlidingWindow = ValuesNumeric[uint64](v.ValueArray())[0] + } else { + ga.AttentionSlidingWindow = ValueNumeric[uint64](v) + } + } + switch arch { + case "llama4": + if ga.AttentionSlidingWindow == 0 { + ga.AttentionSlidingWindow = 8192 + } + ga.AttentionSlidingWindowPattern = 4 + case "phi3": + // See https://github.com/ggml-org/llama.cpp/pull/13676 + ga.AttentionSlidingWindow = 0 + case "gemma2": + if ga.AttentionSlidingWindow == 0 { + ga.AttentionSlidingWindow = 4096 + } + ga.AttentionSlidingWindowPattern = 2 + case "gemma3": + ga.AttentionSlidingWindowPattern = 6 + case "cohere2": + ga.AttentionSlidingWindowPattern = 4 + } if v, ok := m[attentionMaxALiBIBiasKey]; ok { ga.AttentionMaxALiBIBias = ValueNumeric[float32](v) } else if v, ok := m[attentionMaxALiBIBiasKey2]; ok { @@ -742,37 +1020,76 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { if v, ok := m[attentionLayerNormRMSEpsilonKey]; ok { ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } + if v, ok := m[attentionQueryLORARankKey]; ok { + ga.AttentionQueryLORARank = ValueNumeric[uint32](v) + } + if v, ok := m[attentionKeyValueLORARankKey]; ok { + ga.AttentionKeyValueLORARank = ValueNumeric[uint32](v) + } if v, ok := m[attentionKeyLengthKey]; ok { ga.AttentionKeyLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionKeyLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } + if v, ok := m[attentionKeyLengthMLAKey]; ok { + ga.AttentionKeyLengthMLA = ValueNumeric[uint32](v) + } if v, ok := m[attentionValueLengthKey]; ok { ga.AttentionValueLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionValueLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } + if v, ok := m[attentionValueLengthMLAKey]; ok { + ga.AttentionValueLengthMLA = ValueNumeric[uint32](v) + } if v, ok := m[attentionCausalKey]; ok { ga.AttentionCausal = v.ValueBool() } else { ga.AttentionCausal = true } + // See https://github.com/ggml-org/llama.cpp/blob/6491d6e4f1caf0ad2221865b4249ae6938a6308c/src/llama-arch.cpp#L1913-L1924. + ga.AttentionRecurrent = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. + "mamba", + "mamba2", + "rwkv6", + "rwkv6qwen2", + "rwkv7", + "arwkv7", + }, ga.Architecture) + // See https://github.com/ggml-org/llama.cpp/blob/a57d1bcb3c0165ac87b1f0dbb429839b0da69689/src/llama-arch.cpp#L2029-L2038. + ga.AttentionHybrid = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. 
+ "jamba", + "falcon-h1", + "granitehybrid", + }, ga.Architecture) + ga.AttentionRecurrent = ga.AttentionHybrid || ga.AttentionRecurrent if v, ok := m[ropeDimensionCountKey]; ok { ga.RoPEDimensionCount = ValueNumeric[uint64](v) } + ga.RoPEFrequencyBase = 10000.0 if v, ok := m[ropeFrequencyBaseKey]; ok { ga.RoPEFrequencyBase = ValueNumeric[float32](v) } - if v, ok := m[ropeScaleLinearKey]; ok { - ga.RoPEScalingType = "linear" - ga.RoPEScalingFactor = ValueNumeric[float32](v) + ga.RoPEFrequencyScale = 1.0 + if v, ok := m[ropeFrequencyScaleKey]; ok { + ga.RoPEFrequencyScale = ValueNumeric[float32](v) } if v, ok := m[ropeScalingTypeKey]; ok { ga.RoPEScalingType = v.ValueString() } + if v, ok := m[ropeScaleLinearKey]; ok { + ga.RoPEScalingType = "linear" + ga.RoPEScalingFactor = ValueNumeric[float32](v) + if ga.RoPEScalingFactor != 0 { + ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor + } + } if v, ok := m[ropeScalingFactorKey]; ok { ga.RoPEScalingFactor = ValueNumeric[float32](v) + if ga.RoPEScalingFactor != 0 { + ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor + } } if v, ok := m[ropeScalingOriginalContextKey]; ok { ga.RoPEScalingOriginalContextLength = ValueNumeric[uint64](v) @@ -793,6 +1110,27 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { if v, ok := m[ssmTimeStepRankKey]; ok { ga.SSMTimeStepRank = ValueNumeric[uint32](v) } + if v, ok := m[ssmGroupCountKey]; ok { + ga.SSMGroupCount = ValueNumeric[uint32](v) + } + + if v, ok := m[rwkvHeadSizeKey]; ok { + ga.RWKVHeadSize = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvRescaleEveryNLayersKey]; ok { + ga.RWKVRescaleEveryNLayers = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvTimeMixExtraDimensionKey]; ok { + ga.RWKVTimeMixExtraDimension = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvTimeDecayExtraDimensionKey]; ok { + ga.RWKVTimeDecayExtraDimension = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvTokenShiftCountKey]; ok { + ga.RWKVTokenShiftCount = ValueNumeric[uint32](v) + } else if ga.AttentionRecurrent { + ga.RWKVTokenShiftCount = 2 + } if v, ok := m[vocabularyLengthKey]; ok { ga.VocabularyLength = ValueNumeric[uint64](v) @@ -800,19 +1138,5 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { ga.VocabularyLength = v.ValueArray().Len } - { - if ga.AttentionHeadCountKV > 0 { - ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV - } - if ga.AttentionHeadCount > 0 { - ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV - ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV - } - if ga.Architecture == "mamba" { - ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize) - ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize) - } - } - return ga } diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go index 5a49aa37..a47fb0ca 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go @@ -1,6 +1,7 @@ package gguf_parser import ( + "math" "regexp" "slices" "strings" @@ -73,7 +74,11 @@ type ( LLaMACppRunDeviceUsage struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` - // HandleLastLayer is the index of the last layer the device can handle. 
+ // HandleSWALayers is the number of layers that the device can handle in sliding window attention (SWA), + // the non SWA layers is `HandleLayers - HandleSWALayers`. + HandleSWALayers uint64 `json:"handleSWALayers"` + // HandleLastLayer is the index of the last layer the device can handle, + // -1 means the device does not handle the last layer. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. @@ -87,6 +92,8 @@ type ( // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` + // Endpoint is the endpoint of the remote device, empty for local devices. + Endpoint string `json:"endpoint,omitempty"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes. @@ -107,6 +114,8 @@ type ( Input GGUFParametersScalar `json:"input"` // Compute is the parameter usage for compute tensors. Compute GGUFParametersScalar `json:"compute"` + // ComputeOverridden is the parameter usage for overridden compute tensors. + ComputeOverridden GGUFParametersScalar `json:"computeOverridden"` // Output is the parameter usage for output tensors. Output GGUFParametersScalar `json:"output"` } @@ -117,6 +126,8 @@ type ( Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for loading compute tensors. Compute GGUFBytesScalar `json:"compute"` + // ComputeOverridden is the memory usage for loading overridden compute tensors. + ComputeOverridden GGUFBytesScalar `json:"computeOverridden"` // Output is the memory usage for loading output tensors. Output GGUFBytesScalar `json:"output"` } @@ -142,7 +153,7 @@ type ( } ) -// EstimateLLaMACppRun returns the inference estimated result of the GGUF file. +// EstimateLLaMACppRun estimates the usages of the GGUF file in llama.cpp. func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate) { // Options var o _GGUFRunEstimateOptions @@ -196,6 +207,7 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMAC e.Devices[j+1].Remote = j < len(o.RPCServers) if e.Devices[j+1].Remote { e.Devices[j+1].Position = j + e.Devices[j+1].Endpoint = o.RPCServers[j] } else { e.Devices[j+1].Position = j - len(o.RPCServers) } @@ -215,22 +227,24 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMAC case "projector": // For projector model, // see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008. - if ptr.Deref(o.LMCOffloadLayers, a.BlockCount) != 0 { - // None model means full offload. - o.LMCOffloadLayers = ptr.To(a.BlockCount) + if ptr.Deref(o.LMCOffloadLayers, math.MaxUint64) != 0 { + // Full offload. + o.LMCOffloadLayers = ptr.To[uint64](math.MaxUint64) } else { - // None model means zero offload. + // Zero offload. 
o.LMCOffloadLayers = ptr.To[uint64](0) } gf.estimateLLaMACppRunInProjector(&o, &a, &e) case "adapter": - gf.estimateLLaMaCppRunInAdapter(&o, &a, &e) + gf.estimateLLaMACppRunInAdapter(&o, &a, &e) + case "imatrix": + gf.estimateLLaMACppRunInIMatrix(&o, &a, &e) } return e } -// estimateLLaMACppRunInModel estimates the inference result of the GGUF file in llama.cpp for model type, +// estimateLLaMACppRunInModel estimates the usages of the GGUF file for model, // including the usages of footprint, weight, KV cache, and computation. func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, t *GGUFTokenizer, e *LLaMACppRunEstimate) { ls := gf.Layers() @@ -251,6 +265,9 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG a.BlockCount = uint64(len(tfLs)) } + // Using sliding window attention. + usingSWA := a.AttentionSlidingWindowPattern != 1 && !o.LMCFullSizeSWACache + // Full offload: nLoadLayers == 0 && isOffloadOutputLayer // Zero offload: nOffloadLayers == 0 // Partial offload: !Full offload && !Zero offload @@ -260,7 +277,8 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG nLoadLayers = a.BlockCount idxOutputDevice int - fullOffload, zeroOffload bool + fullOffload, zeroOffload bool + nSWALoadLayers, nSWAOffloadLayers uint64 ) { var isOffloadOutputLayer bool @@ -289,17 +307,25 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG e.FullOffloaded = fullOffload e.OffloadLayers = nOffloadLayers - for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { + for i, j, offloadStart := uint64(0), 0, a.BlockCount-nOffloadLayers; i < a.BlockCount; i++ { switch { - case i < int(nLoadLayers): + case i < nLoadLayers: e.Devices[0].HandleLayers += 1 - e.Devices[0].HandleLastLayer = i + e.Devices[0].HandleLastLayer = int(i) + if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { + e.Devices[0].HandleSWALayers += 1 + nSWALoadLayers += 1 + } case i >= offloadStart: x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) e.Devices[j+1].HandleLayers += 1 - e.Devices[j+1].HandleLastLayer = i - if fullOffload && i == len(tfLs)-1 { + e.Devices[j+1].HandleLastLayer = int(i) + if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { + e.Devices[j+1].HandleSWALayers += 1 + nSWAOffloadLayers += 1 + } + if fullOffload && i == a.BlockCount-1 { idxOutputDevice = j + 1 } } @@ -315,11 +341,6 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG if a.Architecture == "grok" { o.FlashAttention = false } - // Attention key length must be equal to attention value length, - // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9571-L9574. - if a.AttentionKeyLength != a.AttentionValueLength { - o.FlashAttention = false - } // Fallback to FP16 if the value type is quantized when disabling flash attention, // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9576-L9579. if o.LMCCacheValueType.IsQuantized() && !o.FlashAttention { @@ -331,9 +352,21 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Embedding. 
if !a.AttentionCausal { + ropeFrequencyBase := ptr.Deref(o.LMCRoPEFrequencyBase, a.RoPEFrequencyBase) + ropeFrequencyScale := ptr.Deref(o.LMCRoPEFrequencyScale, a.RoPEFrequencyScale) + ropeScalingType := ptr.Deref(o.LMCRoPEScalingType, a.RoPEScalingType) + ropeScalingOriginalContextSize := ptr.Deref(o.LMCRoPEScalingOriginalContextSize, int32(a.RoPEScalingOriginalContextLength)) + isRoPECustomized := ropeFrequencyBase != a.RoPEFrequencyBase || + ropeFrequencyScale != a.RoPEFrequencyScale || + ropeScalingType != a.RoPEScalingType || + (ropeScalingType == "yarn" && ropeScalingOriginalContextSize != int32(a.RoPEScalingOriginalContextLength)) + e.EmbeddingOnly = true + o.LMCContextSize = ptr.To(ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength))) // Set context size/physical batch size/logical batch size to the training context size. - o.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength)))) + if !isRoPECustomized { + o.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), *o.LMCContextSize)) + } o.LMCLogicalBatchSize = o.LMCContextSize o.LMCPhysicalBatchSize = o.LMCLogicalBatchSize // Reranking. @@ -350,15 +383,21 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG e.LogicalBatchSize = *o.LMCLogicalBatchSize e.PhysicalBatchSize = *o.LMCPhysicalBatchSize + // Padding alignment. + paddingAlign := uint64(32) + if o.FlashAttention { + paddingAlign = 256 + } + // Init hyperparameters, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6957-L7000. var ( - nContext uint64 - nTokens uint64 - nBatch uint64 - nOutputs uint64 - nParallel uint64 - nKV uint64 + nContext uint64 + nTokens uint64 + nBatch uint64 + nOutputs uint64 + nSeq uint64 + nKV uint64 ) { nContext = a.MaximumContextLength @@ -370,27 +409,16 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } // Padding context size, // see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002. - if o.FlashAttention { - nContext = GGMLPadding(nContext, 256) - } else { - nContext = GGMLPadding(nContext, 32) - } + nContext = GGMLPadding(nContext, paddingAlign) + // Correct token size, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224. nTokens = min(nContext, uint64(*o.LMCPhysicalBatchSize)) nBatch = nTokens nOutputs = nTokens - nParallel = uint64(ptr.Deref(o.ParallelSize, 1)) + nSeq = uint64(ptr.Deref(o.ParallelSize, 1)) nKV = nContext - // For mamba, - // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129. - if a.Architecture == "mamba" { - nKV = nParallel - o.LMCCacheKeyType = ptr.To(GGMLTypeF32) - o.LMCCacheValueType = ptr.To(GGMLTypeF32) - } - e.ContextSize = nContext } @@ -410,7 +438,10 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Output buffer, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. 
- ob := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel + ob := a.EmbeddingLength * nOutputs * 4 /* float32 size */ + if a.AttentionCausal { + ob += a.VocabularyLength * nOutputs * 4 /* float32 size */ + } if fullOffload { e.Devices[idxOutputDevice].Footprint += GGUFBytesScalar(ob) } else { @@ -420,6 +451,66 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Weight & Parameter. { + filter := func(idx int) GGUFTensorInfoFilter { + if len(o.OverriddenTensors) == 0 { + return nil + } + return func(name string) bool { + for _, ot := range o.OverriddenTensors { + bt, bi := ot.ParseBufferType() + switch { + case bt == GGUFRunOverriddenTensorBufferTypeUnknown: + continue + case bt == GGUFRunOverriddenTensorBufferTypeCPU && idx == 0: + continue + case bt == GGUFRunOverriddenTensorBufferTypeGPU && + (e.Devices[idx].Remote || anyx.Number[int](bi)+1 != idx): + continue + case bt == GGUFRunOverriddenTensorBufferTypeRPC && + (!e.Devices[idx].Remote || e.Devices[idx].Endpoint != bi): + continue + } + if ot.PatternRegex.MatchString(name) { + return false + } + } + return true + } + } + + // If overridden tensors are provided, + // we need to search the tensors of the overridden pattern, + // and place them in the correct device. + if len(o.OverriddenTensors) != 0 { + for _, ot := range o.OverriddenTensors { + bt, bi := ot.ParseBufferType() + if bt == GGUFRunOverriddenTensorBufferTypeUnknown { + continue + } + var sls GGUFTensorInfos = ls.Search(ot.PatternRegex) + if len(sls) == 0 { + continue + } + switch bt { + case GGUFRunOverriddenTensorBufferTypeCPU: + e.Devices[0].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) + e.Devices[0].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) + case GGUFRunOverriddenTensorBufferTypeGPU: + idx := anyx.Number[int](bi) + 1 + e.Devices[idx].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) + e.Devices[idx].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) + default: + for i, d := range e.Devices[1:] { + if d.Endpoint == bi { + e.Devices[i+1].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) + e.Devices[i+1].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) + break + } + } + } + } + } + // Compute. for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { idx := 0 @@ -428,8 +519,9 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG j = slicex.UpperBound(o.TensorSplitFraction, x) idx = j + 1 } - e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) - e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements()) + f := filter(idx) + e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes(f)) + e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements(f)) } // IO, @@ -443,7 +535,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) - } else if a.AttentionCausal { + } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } @@ -456,34 +548,105 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } } - // KV cache, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. 
- { - kps, vps := a.EmbeddingKeyGQA*nKV, a.EmbeddingValueGQA*nKV - krs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps}) - - e.Devices[0].KVCache.Key = GGUFBytesScalar(krs * nLoadLayers) - e.Devices[0].KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) - e.Devices[0].Parameter.KVCache = GGUFParametersScalar((kps + vps) * nLoadLayers) - if !*o.LMCOffloadKVCache { - e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers) - e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers) - e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers) - } else if !zeroOffload { - for i, d := range e.Devices[1:] { - e.Devices[i+1].KVCache.Key = GGUFBytesScalar(krs * d.HandleLayers) - e.Devices[i+1].KVCache.Value = GGUFBytesScalar(vrs * d.HandleLayers) - e.Devices[i+1].Parameter.KVCache = GGUFParametersScalar((kps + vps) * d.HandleLayers) + // KV cache. + if a.AttentionCausal { + switch { + // Recurrent, + // see https://github.com/ggml-org/llama.cpp/blob/704bb7a71c01dc07c1478b85f6322bf5dfde1eaf/src/llama-hparams.cpp#L68-L88. + case a.AttentionRecurrent: + var r, s uint64 + if a.RWKVHeadSize > 0 { + r = uint64(a.RWKVTokenShiftCount) * a.EmbeddingLength + s = uint64(a.RWKVHeadSize) * a.EmbeddingLength + } else { + r = uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) + s = uint64(a.SSMStateSize * a.SSMInnerSize) + } + + rps, sps := r*nSeq, s*nSeq + rrs, srs := GGMLTypeF32.RowSizeOf([]uint64{rps}), GGMLTypeF32.RowSizeOf([]uint64{sps}) + + e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nLoadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nLoadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nLoadLayers) + if !*o.LMCOffloadKVCache { + e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nOffloadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nOffloadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nOffloadLayers) + } else if !zeroOffload { + for i, d := range e.Devices[1:] { + e.Devices[i+1].KVCache.Key += GGUFBytesScalar(rrs * d.HandleLayers) + e.Devices[i+1].KVCache.Value += GGUFBytesScalar(srs * d.HandleLayers) + e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * d.HandleLayers) + } + } + + if !a.AttentionHybrid { + break + } + + fallthrough + // Causal, + // see https://github.com/ggml-org/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. 
+ default: + akl, avl := uint64(a.AttentionKeyLength), uint64(a.AttentionValueLength) + if a.AttentionKeyLengthMLA > 0 && a.AttentionValueLengthMLA > 0 { + akl, avl = uint64(a.AttentionKeyLengthMLA), uint64(a.AttentionValueLengthMLA) + } + kGQA := akl * a.AttentionHeadCountKV + vGQA := avl * a.AttentionHeadCountKV + kps, vps := kGQA*nKV, vGQA*nKV + krs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps}) + + if !usingSWA { + e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nLoadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nLoadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nLoadLayers) + if !*o.LMCOffloadKVCache { + e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers) + } else if !zeroOffload { + for i, d := range e.Devices[1:] { + e.Devices[i+1].KVCache.Key += GGUFBytesScalar(krs * d.HandleLayers) + e.Devices[i+1].KVCache.Value += GGUFBytesScalar(vrs * d.HandleLayers) + e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((kps + vps) * d.HandleLayers) + } + } + } else { + // Sliding window attention size, + // see https://github.com/ggml-org/llama.cpp/blob/3079e9ac8e04ef6eddeb0c164d72edb6b6fd2df5/src/llama-kv-cache.cpp#L1640-L1642. + swas := min(nKV, GGMLPadding(a.AttentionSlidingWindow*nSeq+uint64(*o.LMCLogicalBatchSize), paddingAlign)) + swaKps, swaVps := kGQA*swas, vGQA*swas + swaKrs, swaVrs := o.LMCCacheKeyType.RowSizeOf([]uint64{swaKps}), o.LMCCacheValueType.RowSizeOf([]uint64{swaVps}) + + nNonSWALoadLayers, nNonSWAOffloadLayers := nLoadLayers-nSWALoadLayers, nOffloadLayers-nSWAOffloadLayers + + e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWALoadLayers + krs*nNonSWALoadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWALoadLayers + vrs*nNonSWALoadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWALoadLayers + (kps+vps)*nNonSWALoadLayers) + if !*o.LMCOffloadKVCache { + e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWAOffloadLayers + krs*nNonSWAOffloadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWAOffloadLayers + vrs*nNonSWAOffloadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWAOffloadLayers + (kps+vps)*nNonSWAOffloadLayers) + } else if !zeroOffload { + for i, d := range e.Devices[1:] { + e.Devices[i+1].KVCache.Key += GGUFBytesScalar(swaKrs*d.HandleSWALayers + krs*(d.HandleLayers-d.HandleSWALayers)) + e.Devices[i+1].KVCache.Value += GGUFBytesScalar(swaVrs*d.HandleSWALayers + vrs*(d.HandleLayers-d.HandleSWALayers)) + e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*d.HandleSWALayers + (kps+vps)*(d.HandleLayers-d.HandleSWALayers)) + } + } } } } // Computation. { - // Bootstrap, compute metadata, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136. - cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + - GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243. + maxNodes := max(1024, uint64(8*len(gf.TensorInfos))) + + // Bootstrap, compute metadata. 
+ cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint = GGUFBytesScalar(cm) // Scheduler overhead, @@ -506,21 +669,19 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG inpPos = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nOutputs}) // I32 [n_outputs], inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nKV, nBatch}) // F32 [n_kv, n_batch] - inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nKV}) // F32 [1, n_kv] - inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nKV, nBatch}) // I32 [n_kv, n_batch] + inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nSeq}) // F32 [1, n_seq] + inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nSeq, nBatch}) // I32 [n_seq, n_batch] ) - switch { - case a.Architecture == "mamba": - e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpSMask + inpSSeq + inpOutIds) - default: + if a.AttentionRecurrent { + e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + 2*inpSMask + inpSSeq + inpOutIds) + } else { e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) } - if !zeroOffload { + { var v GGUFBytesScalar - switch { - case a.Architecture == "mamba": + if a.AttentionRecurrent { v = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq) - default: + } else { v = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask) } if len(o.RPCServers) == 0 && len(o.TensorSplitFraction) > 1 { @@ -538,35 +699,61 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. - switch { - case a.Architecture == "mamba": - convInc := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingKeyGQA, nKV}) // F32 [n_embd_key_gqa, n_kv] reshape - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { - if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + if a.AttentionRecurrent && !a.AttentionHybrid { + if a.RWKVHeadSize > 0 { + attnInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|attn_norm_2)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + attnInc += rs + } + ffnInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.time_mix_(lerp_x|receptance|decay_w2|key|value|gate|w2|output)\.weight`)) { // nolint: lll + switch { + case strings.HasSuffix(l.Name, ".time_mix_w2.weight"): + rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, 1, nTokens, l.Dimensions[l.NDimensions-1]}) + ffnInc += rs + case strings.HasSuffix(l.Name, ".time_mix_output.weight"): + rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch + uint64(a.RWKVHeadSize)*nSeq}) + ffnInc += rs + default: + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + } + cp := GGUFBytesScalar(attnInc + ffnInc) + for i := range e.Devices[1:] { + e.Devices[i+1].Computation.Compute = cp + } + } else { + r := uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) + convInc := GGMLTypeF32.RowSizeOf([]uint64{r, nSeq}) // F32 [n_embd_key_gqa, nSeq] reshape + for _, l := range 
tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { + if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + convInc += rs + continue + } + // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379. + rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nSeq}) convInc += rs - continue } - // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379. - rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nKV}) - convInc += rs - } - ssmInc := uint64(0) - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) { - if !strings.HasSuffix(l.Name, ".ssm_a") { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + ssmInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) { + if !strings.HasSuffix(l.Name, ".ssm_a") { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + ssmInc += rs + continue + } + // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413. + rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nSeq}) ssmInc += rs - continue } - // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413. - rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nKV}) - ssmInc += rs - } - cp := GGUFBytesScalar(convInc + ssmInc) - for i := range e.Devices[1:] { - e.Devices[i+1].Computation.Compute = cp + cp := GGUFBytesScalar(convInc + ssmInc) + for i := range e.Devices[1:] { + e.Devices[i+1].Computation.Compute = cp + } } - default: + } else { loadAttnInc, offloadAttnInc := uint64(0), uint64(0) { rs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) @@ -577,7 +764,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. offloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens}) - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) { + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv|q_b)\.weight`)) { if strings.HasSuffix(l.Name, ".attn_norm.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs @@ -644,7 +831,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } else { e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc) } - if !zeroOffload { + { cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc)) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp @@ -663,9 +850,13 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Finally, get the usage of output layer. 
if a.AttentionCausal { var outInc uint64 - if a.Architecture == "mamba" { + if a.AttentionRecurrent { outInc += inpSMask + inpSSeq } + if l, ok := opLs.Get("output_norm.weight"); ok { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + outInc += rs + } if l, ok := opLs.Get("output.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs @@ -692,7 +883,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG bs := anyx.Number[float64](*o.LMCLogicalBatchSize) / float64(nBatch) for i, dm := range dmss { fl, upbw, dwbw := float64(max(dm.FLOPS, 1)), float64(max(dm.UpBandwidth, 1)), float64(max(dm.DownBandwidth, 1)) - cmpops := float64(ds[i].Parameter.Compute)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) + cmpops := float64(ds[i].Parameter.Compute+ds[i].Parameter.ComputeOverridden)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) // nolint: lll cmps := float64(ds[i].Weight.Sum()) cmplat := max(cmpops/fl, cmps/upbw) kvcops := float64(ds[i].Parameter.KVCache) * 2 /* FMA */ * bs @@ -715,83 +906,161 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } } +// estimateLLaMACppRunInProjector estimates the usages of the GGUF file for projector. func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ + "mm.*", + // Vision specific IO layers. "v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", - "model.*", "v.post_ln.*", - "mm.*", + "model.*", "resampler.*", + // Audio specific IO layers. + "a.position_embd.*", + "a.conv1d.*", + "a.post_ln.*", }) ipLs, opLs, _ := ioLs.Cut([]string{ + // Vision specific Input layers. "v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", "model.*", + // Audio specific Input layers. + "a.position_embd.*", + "a.conv1d.*", }) - if a.BlockCount == 0 { - a.BlockCount = uint64(len(tfLs)) + // Block count. + if a.ClipHasVisionEncoder && a.ClipVisionBlockCount == 0 { + if len(tfLs) == 1 { + if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"v"}, ntfLs.Name) { + a.ClipVisionBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) + } + } + if a.ClipVisionBlockCount == 0 { + a.ClipVisionBlockCount = uint64(len(tfLs)) + } + } + if a.ClipHasAudioEncoder && a.ClipAudioBlockCount == 0 { + if len(tfLs) == 1 { + if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"a"}, ntfLs.Name) { + a.ClipAudioBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) + } + } + if a.ClipAudioBlockCount == 0 { + a.ClipAudioBlockCount = uint64(len(tfLs)) + } } - e.FullOffloaded = *o.LMCOffloadLayers == a.BlockCount - e.OffloadLayers = *o.LMCOffloadLayers + // Offload layers. + if *o.LMCOffloadLayers == math.MaxUint64 { + e.FullOffloaded = true + e.OffloadLayers = a.ClipVisionBlockCount + a.ClipAudioBlockCount + o.LMCOffloadLayers = ptr.To(e.OffloadLayers) + } else { + e.FullOffloaded = false + e.OffloadLayers = 0 + } - // Init hyperparameters, - // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636. 
- var ( - imgHeightSize uint64 - imgWidthSize uint64 - imgPatchSize uint64 - nPatchesHeight uint64 - nPatchesWidth uint64 - nPatches uint64 - imgPatchesMaxSize uint64 - imgPatches uint64 - projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. - ) + // Footprint. { + // Bootstrap. + e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ + } + + idx := 0 // Default to the main host's RAM. + if e.FullOffloaded { + for i := 1; i < len(e.Devices); i++ { + if !e.Devices[i].Remote { + idx = i + break + } + } + } + + // Weight & Parameter. + { + // Compute. + e.Devices[idx].HandleLayers = *o.LMCOffloadLayers + e.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1) + e.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes()) + e.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements()) + + // IO. + e.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) + e.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) + e.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes()) + e.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements()) + } + + if a.ClipHasVisionEncoder { + // Init hyperparameters, + // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636. + var ( + heightMaxSize uint64 // y + widthMaxSize uint64 // x + // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L3462. + nPatches uint64 + patchesMaxSize uint64 + // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L4016. + projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. + ) // See https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L397-L411, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2323-L2345, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2767-L2794. - imgHeightSize = uint64(a.ClipVisionImageSize) - imgWidthSize = imgHeightSize - imgPatchSize = uint64(a.ClipVisionPatchSize) - if a.ClipHasQwen2VLMerger { - imgHeightSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 224)) - imgWidthSize = imgHeightSize + heightMaxSize = uint64(a.ClipVisionImageSize) + widthMaxSize = heightMaxSize + if a.ClipHasQwen2VLMerger || + a.ClipProjectorType == "qwen2vl_merger" || + a.ClipProjectorType == "qwen2.5vl_merger" || + a.ClipProjectorType == "qwen2.5o" || + a.ClipProjectorType == "pixtral" { + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L2217. 
+ heightMaxSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 1024)) + widthMaxSize = heightMaxSize } - nPatchesHeight = imgHeightSize / imgPatchSize - nPatchesWidth = imgWidthSize / imgPatchSize + nPatchSize := uint64(a.ClipVisionPatchSize) + nPatchesHeight := heightMaxSize / nPatchSize + nPatchesWidth := widthMaxSize / nPatchSize nPatches = nPatchesHeight * nPatchesWidth - imgPatchesMaxSize = 1 - imgPatches = nPatches + patchesMaxSize = 1 switch { - case a.ClipHasLLaVAProjector: + case a.ClipHasLLaVAProjector || + a.ClipProjectorType == "mlp" || + a.ClipProjectorType == "mlp_norm" || + a.ClipProjectorType == "ldp" || + a.ClipProjectorType == "ldpv2": // LLaVA 1.6 uses up to 6 patches if a.ClipVisionMMPatchMergeType != "flat" { - imgPatchesMaxSize = 6 + patchesMaxSize = 6 } - case a.ClipHasMiniCPMVProjector: + case a.ClipHasMiniCPMVProjector || + a.ClipProjectorType == "resampler": // MiniCPM-V uses up to 10 patches - imgPatchesMaxSize = 10 + patchesMaxSize = 10 case a.ClipProjectorType == "adapter": // Granite vision uses up to 10 patches + base patch - imgPatchesMaxSize = 11 + patchesMaxSize = 11 + } + + if o.LMCMaxProjectedCache != nil { + patchesMaxSize += uint64(*o.LMCMaxProjectedCache) } + switch a.ClipProjectorType { case "ldp": - imgPatches /= 4 + nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.mb_block.1.block.2.1.bias"); ok { projectionDim = ti.Dimensions[0] } case "ldpv2": - imgPatches /= 4 + nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.peg.0.bias"); ok { projectionDim = ti.Dimensions[0] } @@ -805,142 +1074,208 @@ func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a } case "resampler": if ti, ok := gf.TensorInfos.Get("resampler.query"); ok { - imgPatches = ti.Dimensions[1] + nPatches = ti.Dimensions[1] projectionDim = ti.Dimensions[0] } case "adapter": + nPatches /= 4 + nPatches += 2 if ti, ok := gf.TensorInfos.Get("adapter.linear.dense_4h_to_h.weight"); ok { projectionDim = ti.Dimensions[1] } - case "qwen2vl_merger": + case "qwen2vl_merger", "qwen2.5vl_merger", "qwen2.5o": nSizePatch := uint64(a.ClipVisionPatchSize * 2) - imgHeightPatchSize := imgHeightSize / nSizePatch - if imgHeightSize%nSizePatch > 0 { - imgHeightPatchSize++ + heightPatchSize := heightMaxSize / nSizePatch + if heightMaxSize%nSizePatch > 0 { + heightPatchSize++ } - imgWidthPatchSize := imgWidthSize / nSizePatch - if imgWidthSize%nSizePatch > 0 { - imgWidthPatchSize++ + widthPatchSize := widthMaxSize / nSizePatch + if widthMaxSize%nSizePatch > 0 { + widthPatchSize++ } - imgPatches = imgHeightPatchSize * imgWidthPatchSize + nPatches = heightPatchSize * widthPatchSize if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "gemma3": + nPerSide := uint64(a.ClipVisionImageSize) / uint64(a.ClipVisionPatchSize) + nPerSide2DPool := nPerSide / uint64(a.ClipVisionProjectorScaleFactor) + nPatches = nPerSide2DPool * nPerSide2DPool if ti, ok := gf.TensorInfos.Get("mm.input_projection.weight"); ok { - imgPatches = 256 projectionDim = ti.Dimensions[0] } + case "idefics3", "llama4": + nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) + if ti, ok := gf.TensorInfos.Get("mm.model.fc.weight"); ok { + projectionDim = ti.Dimensions[1] + } + case "pixtral": + heightPatchSize := heightMaxSize / uint64(a.ClipVisionPatchSize) + if a.ClipVisionSpatialMergeSize > 0 { + heightPatchSize /= uint64(a.ClipVisionSpatialMergeSize) + } + widthPatchSize := widthMaxSize / uint64(a.ClipVisionPatchSize) + if 
a.ClipVisionSpatialMergeSize > 0 { + widthPatchSize /= uint64(a.ClipVisionSpatialMergeSize) + } + nPatches = heightPatchSize*widthPatchSize + heightPatchSize - 1 /* [IMG_BREAK] per row */ + if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { + projectionDim = ti.Dimensions[0] + } + case "internvl": + nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) + if ti, ok := gf.TensorInfos.Get("mm.model.mlp.3.weight"); ok { + projectionDim = ti.Dimensions[1] + } } - } - - // Footprint. - { - // Bootstrap. - e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ - // Image Embed, - // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407. - e.Devices[0].Footprint += GGUFBytesScalar(imgPatchesMaxSize * imgPatches * projectionDim * 4 /* float32 size */) - } + // Footprint + { + // Image Embed, + // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407. + e.Devices[0].Footprint += GGUFBytesScalar(patchesMaxSize * nPatches * projectionDim * 4 /* float32 size */) + } - idx := 0 // Default to the main host's RAM. - if *o.LMCOffloadLayers != 0 { - for i := 1; i < len(e.Devices); i++ { - if !e.Devices[i].Remote { - idx = i - break + // Computation. + { + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. + var maxNodes uint64 = 8192 + + // Bootstrap, compute metadata. + cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) + + // Scheduler overhead, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. + e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) + + // GGML context, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. + gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipVisionBlockCount*3) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) + + // Tensor usage. + var ( + hasClassEmbd bool + nPositions uint64 + nBatch uint64 + nEmbd uint64 + nHead uint64 + ) + { + _, hasClassEmbd = ipLs.Get("v.class_embd") + nPositions = nPatches + if hasClassEmbd { + nPositions += 1 + } + if a.ClipHasQwen2VLMerger || + a.ClipProjectorType == "qwen2vl_merger" || + a.ClipProjectorType == "qwen2.5vl_merger" || + a.ClipProjectorType == "qwen2.5o" { + nPositions *= 4 + } + nBatch = 1 + nEmbd = a.ClipVisionEmbeddingLength + nHead = a.ClipVisionAttentionHeadCount + } + // First, get the usage of input layer. 
+ { + var ( + inpRaw = GGMLTypeF32.RowSizeOf([]uint64{widthMaxSize, heightMaxSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch] + inpRawCnt = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch}) // I32 [n_patches, n_embd, n_batch] + inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embd, n_positions, n_batch] + inpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatches, nBatch}) // F32 [mmproj, n_patches, n_batch] + inpPos = GGMLTypeI32.RowSizeOf([]uint64{nPositions}) // I32 [n_positions] + inpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] + ) + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches) + if a.ClipHasMiniCPMVProjector || + a.ClipProjectorType == "resampler" { + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd) + } + if hasClassEmbd { + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) + } + if a.ClipVisionWindowAttentionPattern > 0 { // Qwen2.5 VL + inpWindowIndex := GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] + inpWindowMask := GGMLTypeI32.RowSizeOf([]uint64{nPositions, nPositions}) // I32 [n_positions, n_positions] + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpWindowIndex + inpWindowMask) + } + } + // Since the steps between transformer layers are serial, + // the allocated memory can be reused for the next layer. + // So, we only consider the usage of a certain layer. + { + compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2 + compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) + e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) } } } - // Weight & Parameter. - { - // Compute. - e.Devices[idx].HandleLayers = *o.LMCOffloadLayers - e.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1) - e.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes()) - e.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements()) - - // IO. - e.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) - e.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) - e.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes()) - e.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements()) - } - - // Computation. - { - // Bootstrap, compute metadata, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136. - cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + - GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) - e.Devices[0].Computation.Footprint = GGUFBytesScalar(cm) - - // Scheduler overhead, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. - e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) - - // GGML context, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. - gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) - e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) - - // Tensor usage. 
- var ( - hasClassEmbd bool - nPositions uint64 - nPositionIDs uint64 - nBatch uint64 - nEmbd uint64 - nHead uint64 - ) + if a.ClipHasAudioEncoder { + // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/mtmd-audio.cpp#L311. + var projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. { - _, hasClassEmbd = ipLs.Get("v.class_embd") - nPositions = nPatches - if hasClassEmbd { - nPositions += 1 - } - nPositionIDs = nPositions - if a.ClipHasQwen2VLMerger { - nPositionIDs *= 4 + if ti, ok := gf.TensorInfos.Get("a.position_embd.weight"); ok { + projectionDim = ti.Dimensions[1] } - nBatch = 1 - nEmbd = a.EmbeddingLength - nHead = a.AttentionHeadCount } - // First, get the usage of input layer. - var ( - inpRaw = GGMLTypeF32.RowSizeOf([]uint64{imgWidthSize, imgHeightSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch] - inpRawCnt = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch}) // I32 [n_patches, n_embd, n_batch] - inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embd, n_positions, n_batch] - inpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatchesHeight * nPatchesWidth, nBatch}) // F32 [mmproj, pos_h * pos_w, n_batch] - inpPos = GGMLTypeI32.RowSizeOf([]uint64{nPositionIDs}) // I32 [n_positions] - inpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] - ) + + // Computation. { - e.Devices[idx].Computation.Input = GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches) - if a.ClipHasMiniCPMVProjector { - e.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd) + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. + var maxNodes uint64 = 8192 + + // Bootstrap, compute metadata. + cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) + + // Scheduler overhead, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. + e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) + + // GGML context, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. + gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipAudioBlockCount*3) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) + + // Tensor usage. + var ( + nPositions uint64 + nBatch uint64 + nEmbd uint64 + nHead uint64 + ) + { + nPositions = projectionDim + nBatch = 1 + nEmbd = a.ClipAudioEmbeddingLength + nHead = a.ClipAudioAttentionHeadCount } - if hasClassEmbd { + // First, get the usage of input layer. + { + inpEmbd := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embed, n_positions, n_batch] e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) } - } - // Since the steps between transformer layers are serial, - // the allocated memory can be reused for the next layer. - // So, we only consider the usage of a certain layer. 
- { - compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2 - compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) - compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) - compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) - e.Devices[idx].Computation.Compute = GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) + // Since the steps between transformer layers are serial, + // the allocated memory can be reused for the next layer. + // So, we only consider the usage of a certain layer. + { + compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) + e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) + } } } } -func (gf *GGUFFile) estimateLLaMaCppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { +// estimateLLaMACppRunInAdapter estimates the usages of the GGUF file for adapter. +func (gf *GGUFFile) estimateLLaMACppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "position_*", @@ -1048,7 +1383,7 @@ func (gf *GGUFFile) estimateLLaMaCppRunInAdapter(o *_GGUFRunEstimateOptions, a * if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) - } else if a.AttentionCausal { + } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } @@ -1062,6 +1397,36 @@ func (gf *GGUFFile) estimateLLaMaCppRunInAdapter(o *_GGUFRunEstimateOptions, a * } } +// estimateLLaMACppRunInIMatrix estimates the usages of the GGUF file for imatrix. +func (gf *GGUFFile) estimateLLaMACppRunInIMatrix(_ *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { + ls := gf.Layers() + + if a.BlockCount == 0 { + a.BlockCount = uint64(len(ls)) + } + + // Distributable. + e.Distributable = false + + // Footprint. + { + // Bootstrap. + e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ + } + + // Weight & Parameter. + { + var ( + wg GGUFBytesScalar + ps GGUFParametersScalar + ) + wg = GGUFBytesScalar(ls.Bytes()) + ps = GGUFParametersScalar(ls.Elements()) + e.Devices[0].Weight.Compute = wg + e.Devices[0].Parameter.Compute = ps + } +} + // Types for LLaMACpp estimated summary. type ( // LLaMACppRunEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp. 
@@ -1177,6 +1542,7 @@ func (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMA emi.RAM.UMA -= wg if !mmap { emi.RAM.UMA += e.Devices[0].Weight.Output + emi.RAM.UMA += e.Devices[0].Weight.ComputeOverridden } } @@ -1277,7 +1643,7 @@ func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVram } func (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar { - return u.Input + u.Compute + u.Output + return u.Input + u.Compute + u.ComputeOverridden + u.Output } func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar { @@ -1287,3 +1653,9 @@ func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar { func (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar { return u.Footprint + u.Input + max(u.Compute, u.Output) } + +// ClipAligning returns the aligned value of x to the nearest multiple of n, +// see https://github.com/ggml-org/llama.cpp/blob/cdf94a18023c92f41808ec874ba577d914674717/tools/mtmd/clip-impl.h#L114-L115. +func ClipAligning(x, n uint64) uint64 { + return ((x + n - 1) / n) * n +} diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go index dc1dd0f2..34cb5228 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go @@ -70,6 +70,7 @@ type ( } ) +// EstimateStableDiffusionCppRun estimates the usages of the GGUF file in stable-diffusion.cpp. func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate) { // Options var o _GGUFRunEstimateOptions @@ -233,7 +234,7 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) } // Autoencoder. - if aeLs != nil { + if len(aeLs) != 0 { e.Autoencoder.Devices[aeDevIdx].Weight = GGUFBytesScalar(aeLs.Bytes()) e.Autoencoder.Devices[aeDevIdx].Parameter = GGUFParametersScalar(aeLs.Elements()) } @@ -245,10 +246,11 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) // Computation. { - // Bootstrap, compute metadata, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136. - cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + - GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) + // See https://github.com/leejet/stable-diffusion.cpp/blob/10c6501bd05a697e014f1bee3a84e5664290c489/ggml_extend.hpp#L1058C9-L1058C23. + var maxNodes uint64 = 32768 + + // Bootstrap, compute metadata. + cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation = GGUFBytesScalar(cm) // Work context, @@ -350,7 +352,7 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) } // Decode usage. - if aeLs != nil && !*o.SDCFreeComputeMemoryImmediately { + if len(aeLs) != 0 && !*o.SDCFreeComputeMemoryImmediately { // Bootstrap. 
e.Autoencoder.Devices[aeDevIdx].Footprint += GGUFBytesScalar(100 * 1024 * 1024) /*100 MiB.*/ diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go b/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go index 9dee4bf0..3591ad6f 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go @@ -1,7 +1,9 @@ package gguf_parser import ( + "regexp" "slices" + "strconv" "github.com/gpustack/gguf-parser-go/util/ptr" ) @@ -14,22 +16,29 @@ type ( MainGPUIndex int RPCServers []string TensorSplitFraction []float64 + OverriddenTensors []*GGUFRunOverriddenTensor DeviceMetrics []GGUFRunDeviceMetric // LLaMACpp (LMC) specific - LMCContextSize *int32 - LMCInMaxContextSize bool - LMCLogicalBatchSize *int32 - LMCPhysicalBatchSize *int32 - LMCVisualMaxImageSize *uint32 - LMCCacheKeyType *GGMLType - LMCCacheValueType *GGMLType - LMCOffloadKVCache *bool - LMCOffloadLayers *uint64 - LMCSplitMode LLaMACppSplitMode - LMCProjector *LLaMACppRunEstimate - LMCDrafter *LLaMACppRunEstimate - LMCAdapters []LLaMACppRunEstimate + LMCContextSize *int32 + LMCRoPEFrequencyBase *float32 + LMCRoPEFrequencyScale *float32 + LMCRoPEScalingType *string + LMCRoPEScalingOriginalContextSize *int32 + LMCInMaxContextSize bool + LMCLogicalBatchSize *int32 + LMCPhysicalBatchSize *int32 + LMCVisualMaxImageSize *uint32 + LMCMaxProjectedCache *uint32 + LMCCacheKeyType *GGMLType + LMCCacheValueType *GGMLType + LMCOffloadKVCache *bool + LMCOffloadLayers *uint64 + LMCSplitMode LLaMACppSplitMode + LMCFullSizeSWACache bool + LMCProjector *LLaMACppRunEstimate + LMCDrafter *LLaMACppRunEstimate + LMCAdapters []LLaMACppRunEstimate // StableDiffusionCpp (SDC) specific SDCOffloadLayers *uint64 @@ -44,6 +53,24 @@ type ( SDCControlNet *StableDiffusionCppRunEstimate } + // GGUFRunOverriddenTensor holds the overridden tensor information for the estimate. + // + // When BufferType is CPU, + // it indicates that the tensor should be loaded into the CPU memory, + // even if it belongs to a GPU offload layer. + GGUFRunOverriddenTensor struct { + // PatternRegex is the regex pattern to match the tensor name. + PatternRegex *regexp.Regexp + // BufferType is the buffer type to override, + // it can be "CPU", "CUDA0", "Metal" and others. + BufferType string + + // _BufferType record parsed buffer type, used internally. + _BufferType GGUFRunOverriddenTensorBufferType + // _Index record parsed device index, used internally. + _Index string + } + // GGUFRunDeviceMetric holds the device metric for the estimate. // // When the device represents a CPU, @@ -74,6 +101,53 @@ type ( GGUFRunEstimateOption func(*_GGUFRunEstimateOptions) ) +// GGUFRunOverriddenTensorBufferType is the type of the overridden tensor buffer. +type GGUFRunOverriddenTensorBufferType uint32 + +const ( + _ GGUFRunOverriddenTensorBufferType = iota + GGUFRunOverriddenTensorBufferTypeCPU + GGUFRunOverriddenTensorBufferTypeGPU + GGUFRunOverriddenTensorBufferTypeRPC + GGUFRunOverriddenTensorBufferTypeUnknown +) + +var ( + _GGUFRunOverriddenTensorBufferTypeCPURegex = regexp.MustCompile(`^(CPU|AMX)`) + _GGUFRunOverriddenTensorBufferTypeUMAGPURegex = regexp.MustCompile(`^(Metal|OpenCL)`) + _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex = regexp.MustCompile(`^(CUDA|CANN|ROCm|MUSA|SYCL|Vulkan|Kompute)(\d+)?`) + _GGUFRunOverriddenTensorBufferTypeRPCRegex = regexp.MustCompile(`^RPC\[(.*)\]`) +) + +// ParseBufferType returns the device index of the overridden tensor. 
+// +// The device index is used to determine which device the tensor belongs to, +// it is according to the buffer type description. +func (odt *GGUFRunOverriddenTensor) ParseBufferType() (GGUFRunOverriddenTensorBufferType, string) { + if odt == nil { + return GGUFRunOverriddenTensorBufferTypeUnknown, "" + } + + if odt._BufferType == 0 { + odt._BufferType = GGUFRunOverriddenTensorBufferTypeUnknown + if ms := _GGUFRunOverriddenTensorBufferTypeCPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeCPU, "0" + } + if ms := _GGUFRunOverriddenTensorBufferTypeUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, "1" + } + if ms := _GGUFRunOverriddenTensorBufferTypeRPCRegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeRPC, ms[1] + } + if ms := _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 2 { + if idx, err := strconv.ParseInt(ms[2], 10, 64); err == nil && idx >= 0 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, ms[2] + } + } + } + return odt._BufferType, odt._Index +} + // WithParallelSize sets the (decoding sequences) parallel size for the estimate. func WithParallelSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { @@ -137,6 +211,24 @@ func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption { } } +// WithOverriddenTensors sets the overridden tensors for the estimate. +func WithOverriddenTensors(tensors []GGUFRunOverriddenTensor) GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + if len(tensors) == 0 { + return + } + for _, t := range tensors { + if t.PatternRegex == nil || t.BufferType == "" { + return + } + } + o.OverriddenTensors = make([]*GGUFRunOverriddenTensor, len(tensors)) + for i := range tensors { + o.OverriddenTensors[i] = &tensors[i] + } + } +} + // WithDeviceMetrics sets the device metrics for the estimate. func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { @@ -157,6 +249,29 @@ func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption { } } +// WithLLaMACppRoPE sets the RoPE parameters for the estimate. +func WithLLaMACppRoPE( + frequencyBase float64, + frequencyScale float64, + scalingType string, + scalingOriginalContextSize int32, +) GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + if frequencyBase > 0 { + o.LMCRoPEFrequencyBase = ptr.Float32(float32(frequencyBase)) + } + if frequencyScale > 0 { + o.LMCRoPEFrequencyScale = ptr.Float32(float32(frequencyScale)) + } + if slices.Contains([]string{"none", "linear", "yarn"}, scalingType) { + o.LMCRoPEScalingType = &scalingType + } + if scalingOriginalContextSize > 0 { + o.LMCRoPEScalingOriginalContextSize = ptr.To(scalingOriginalContextSize) + } + } +} + // WithinLLaMACppMaxContextSize limits the context size to the maximum, // if the context size is over the maximum. func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption { @@ -247,6 +362,13 @@ func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption { } } +// WithLLaMACppFullSizeSWACache enables full size sliding window attention cache. 
+func WithLLaMACppFullSizeSWACache() GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + o.LMCFullSizeSWACache = true + } +} + // WithLLaMACppVisualMaxImageSize sets the visual maximum image size input for the estimate. func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { @@ -257,6 +379,16 @@ func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption { } } +// WithLLaMACppMaxProjectedCache sets the maximum projected embedding cache for the estimate. +func WithLLaMACppMaxProjectedCache(cacheSize uint32) GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + if cacheSize == 0 { + return + } + o.LMCMaxProjectedCache = ptr.To(cacheSize) + } +} + // WithLLaMACppDrafter sets the drafter estimate usage. func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go b/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go index 13e39e05..bc569e1e 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go @@ -2,6 +2,7 @@ package gguf_parser import ( "regexp" + "slices" "sort" "strings" @@ -53,6 +54,10 @@ type GGUFMetadata struct { License string `json:"license,omitempty"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` + // FileTypeDescriptor describes the type of the GGUF file according to the FileType and trait layer. + // + // This supplies the FileType with more detail. + FileTypeDescriptor string `json:"fileTypeDetail"` /* Appendix */ @@ -70,52 +75,85 @@ type GGUFMetadata struct { } // GGUFFileType is a type of GGUF file, -// see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/ggml/include/ggml.h#L404-L433. +// see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L419-L445, +// and https://github.com/huggingface/huggingface.js/blob/d67a464473ca07fee9811a129e5fac8cc7487098/packages/tasks/src/gguf.ts#L4-L52. type GGUFFileType uint32 // GGUFFileType constants. // // GGUFFileTypeMostlyQ4_2, GGUFFileTypeMostlyQ4_3 are deprecated. +// GGUFFileTypeMostlyQ4_0_4_4, GGUFFileTypeMostlyQ4_0_4_8, GGUFFileTypeMostlyQ4_0_8_8 are deprecated. // -// GGUFFileTypeMostlyQ4_1_F16 is a special case where the majority of the tensors are Q4_1, +// GGUFFileTypeMostlyQ4_1_SOME_F16 is a special case where the majority of the tensors are Q4_1, // but 'token_embd.weight' and 'output.weight' tensors are F16. 
const ( - GGUFFileTypeAllF32 GGUFFileType = iota // F32 - GGUFFileTypeMostlyF16 // F16 - GGUFFileTypeMostlyQ4_0 // Q4_0 - GGUFFileTypeMostlyQ4_1 // Q4_1 - GGUFFileTypeMostlyQ4_1_F16 // Q4_1_F16 - GGUFFileTypeMostlyQ4_2 // Q4_2 - GGUFFileTypeMostlyQ4_3 // Q4_3 - GGUFFileTypeMostlyQ8_0 // Q8_0 - GGUFFileTypeMostlyQ5_0 // Q5_0 - GGUFFileTypeMostlyQ5_1 // Q5_1 - GGUFFileTypeMostlyQ2_K // Q2_K - GGUFFileTypeMostlyQ3_K // Q3_K/Q3_K_S - GGUFFileTypeMostlyQ4_K // Q4_K/Q3_K_M - GGUFFileTypeMostlyQ5_K // Q5_K/Q3_K_L - GGUFFileTypeMostlyQ6_K // Q6_K/Q4_K_S - GGUFFileTypeMostlyIQ2_XXS // IQ2_XXS/Q4_K_M - GGUFFileTypeMostlyIQ2_XS // IQ2_XS/Q5_K_S - GGUFFileTypeMostlyIQ3_XXS // IQ3_XXS/Q5_K_M - GGUFFileTypeMostlyIQ1_S // IQ1_S/Q6_K - GGUFFileTypeMostlyIQ4_NL // IQ4_NL - GGUFFileTypeMostlyIQ3_S // IQ3_S - GGUFFileTypeMostlyIQ2_S // IQ2_S - GGUFFileTypeMostlyIQ4_XS // IQ4_XS - GGUFFileTypeMostlyIQ1_M // IQ1_M - GGUFFileTypeMostlyBF16 // BF16 - GGUFFileTypeMostlyQ4_0_4_4 // Q4_0_4x4 - GGUFFileTypeMostlyQ4_0_4_8 // Q4_0_4x8 - GGUFFileTypeMostlyQ4_0_8_8 // Q4_0_8x8 - GGUFFileTypeMostlyTQ1_0 // TQ1_0 - GGUFFileTypeMostlyTQ2_0 // TQ2_0 - GGUFFileTypeMostlyIQ4_NL_4_4 // IQ4_NL_4x4 - GGUFFileTypeMostlyIQ4_NL_4_8 // IQ4_NL_4x8 - GGUFFileTypeMostlyIQ4_NL_8_8 // IQ4_NL_8x8 - _GGUFFileTypeCount // Unknown + GGUFFileTypeMostlyF32 GGUFFileType = iota // MOSTLY_F32 + GGUFFileTypeMostlyF16 // MOSTLY_F16 + GGUFFileTypeMostlyQ4_0 // MOSTLY_Q4_0 + GGUFFileTypeMostlyQ4_1 // MOSTLY_Q4_1 + GGUFFileTypeMostlyQ4_1_SOME_F16 // MOSTLY_Q4_1_SOME_F16 + GGUFFileTypeMostlyQ4_2 // MOSTLY_Q4_2 + GGUFFileTypeMostlyQ4_3 // MOSTLY_Q4_3 + GGUFFileTypeMostlyQ8_0 // MOSTLY_Q8_0 + GGUFFileTypeMostlyQ5_0 // MOSTLY_Q5_0 + GGUFFileTypeMostlyQ5_1 // MOSTLY_Q5_1 + GGUFFileTypeMostlyQ2_K // MOSTLY_Q2_K + GGUFFileTypeMostlyQ3_K_S // MOSTLY_Q3_K_S + GGUFFileTypeMostlyQ3_K_M // MOSTLY_Q3_K_M + GGUFFileTypeMostlyQ3_K_L // MOSTLY_Q3_K_L + GGUFFileTypeMostlyQ4_K_S // MOSTLY_Q4_K_S + GGUFFileTypeMostlyQ4_K_M // MOSTLY_Q4_K_M + GGUFFileTypeMostlyQ5_K_S // MOSTLY_Q5_K_S + GGUFFileTypeMostlyQ5_K_M // MOSTLY_Q5_K_M + GGUFFileTypeMostlyQ6_K // MOSTLY_Q6_K + GGUFFileTypeMostlyIQ2_XXS // MOSTLY_IQ2_XXS + GGUFFileTypeMostlyIQ2_XS // MOSTLY_IQ2_XS + GGUFFileTypeMostlyQ2_K_S // MOSTLY_Q2_K_S + GGUFFileTypeMostlyIQ3_XS // MOSTLY_IQ3_XS + GGUFFileTypeMostlyIQ3_XXS // MOSTLY_IQ3_XXS + GGUFFileTypeMostlyIQ1_S // MOSTLY_IQ1_S + GGUFFileTypeMostlyIQ4_NL // MOSTLY_IQ4_NL + GGUFFileTypeMostlyIQ3_S // MOSTLY_IQ3_S + GGUFFileTypeMostlyIQ3_M // MOSTLY_IQ3_M + GGUFFileTypeMostlyIQ2_S // MOSTLY_IQ2_S + GGUFFileTypeMostlyIQ2_M // MOSTLY_IQ2_M + GGUFFileTypeMostlyIQ4_XS // MOSTLY_IQ4_XS + GGUFFileTypeMostlyIQ1_M // MOSTLY_IQ1_M + GGUFFileTypeMostlyBF16 // MOSTLY_BF16 + GGUFFileTypeMostlyQ4_0_4_4 // MOSTLY_Q4_0_4_4 + GGUFFileTypeMostlyQ4_0_4_8 // MOSTLY_Q4_0_4_8 + GGUFFileTypeMostlyQ4_0_8_8 // MOSTLY_Q4_0_8_8 + GGUFFileTypeMostlyTQ1_0 // MOSTLY_TQ1_0 + GGUFFileTypeMostlyTQ2_0 // MOSTLY_TQ2_0 + GGUFFileTypeMostlyMXFP4 // MOSTLY_MXFP4 + _GGUFFileTypeCount // Unknown ) +// _GGUFPotentialDiffusionArchitectures holds a list representing the potential diffusion architectures. +// +// Since we will unify all diffusion architectures to "diffusion" during processing, +// we can use this list to match the value in explicit `general.architecture`. 
+var _GGUFPotentialDiffusionArchitectures = []string{ + "flux", + "sd", + "sd2.5", + "sd3", + "stable-diffusion", +} + +// _GGUFPotentialDiffusionArchitectureTensorsRegexes holds a list of regexes to match the potential diffusion architecture tensors. +// +// This is used to detect if the GGUF file is a diffusion model, +// when the `general.architecture` is not set to a known diffusion architecture. +var _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{ + regexp.MustCompile(`^model\.diffusion_model\..*`), + regexp.MustCompile(`^double_blocks\..*`), + regexp.MustCompile(`^joint_blocks\..*`), + regexp.MustCompile(`^decoder\..*`), + regexp.MustCompile(`^encoder\..*`), + regexp.MustCompile(`^text_model\..*`), +} + // Metadata returns the metadata of the GGUF file. func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { const ( @@ -128,13 +166,10 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { urlKey = "general.url" descriptionKey = "general.description" licenseKey = "general.license" - fileTypeKey = "general.file_type" controlVectorModelHintKey = "controlvector.model_hint" ) - gm.FileType = _GGUFFileTypeCount - m, _ := gf.Header.MetadataKV.Index([]string{ typeKey, architectureKey, @@ -145,7 +180,6 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { urlKey, descriptionKey, licenseKey, - fileTypeKey, controlVectorModelHintKey, }) @@ -158,17 +192,20 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { } if v, ok := m[controlVectorModelHintKey]; ok { gm.Architecture = v.ValueString() - } else if v, ok = m[architectureKey]; ok { + } else if v, ok = m[architectureKey]; ok && !slices.Contains(_GGUFPotentialDiffusionArchitectures, v.ValueString()) { gm.Architecture = v.ValueString() if gm.Architecture == "clip" { gm.Type = "projector" } + } else if gm.Type == "imatrix" { + gm.Architecture = "imatrix" // Default to imatrix. } else { - if gf.TensorInfos.Match(regexp.MustCompile(`^model\.diffusion_model\..*`)) || - gf.TensorInfos.Match(regexp.MustCompile(`^double_blocks\..*`)) { - gm.Architecture = "diffusion" - } else { - gm.Architecture = "llama" + gm.Architecture = "llama" // Default to llama. + for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { + if gf.TensorInfos.Match(re) { + gm.Architecture = "diffusion" + break + } } } if v, ok := m[quantizationKey]; ok { @@ -194,13 +231,7 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { if v, ok := m[licenseKey]; ok { gm.License = v.ValueString() } - if v, ok := m[fileTypeKey]; ok { - gm.FileType = GGUFFileType(ValueNumeric[uint32](v)) - } - - if gm.FileType >= _GGUFFileTypeCount { - gm.FileType = gf.guessFileType(gm.Architecture) - } + gm.FileType, gm.FileTypeDescriptor = gf.extractFileType(gm.Architecture) gm.LittleEndian = gf.Header.Version < GGUFVersionV3 || gf.Header.Magic == GGUFMagicGGUFLe gm.FileSize = gf.Size @@ -216,7 +247,7 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2730-L2763. 
func (t GGUFFileType) GGMLType() GGMLType { switch t { - case GGUFFileTypeAllF32: + case GGUFFileTypeMostlyF32: return GGMLTypeF32 case GGUFFileTypeMostlyF16: return GGMLTypeF16 @@ -224,6 +255,8 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeQ4_0 case GGUFFileTypeMostlyQ4_1: return GGMLTypeQ4_1 + case GGUFFileTypeMostlyQ4_1_SOME_F16: + return GGMLTypeQ4_1 case GGUFFileTypeMostlyQ4_2: return GGMLTypeQ4_2 case GGUFFileTypeMostlyQ4_3: @@ -236,11 +269,19 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeQ5_1 case GGUFFileTypeMostlyQ2_K: return GGMLTypeQ2_K - case GGUFFileTypeMostlyQ3_K: + case GGUFFileTypeMostlyQ3_K_S: return GGMLTypeQ3_K - case GGUFFileTypeMostlyQ4_K: + case GGUFFileTypeMostlyQ3_K_M: + return GGMLTypeQ4_K + case GGUFFileTypeMostlyQ3_K_L: + return GGMLTypeQ5_K + case GGUFFileTypeMostlyQ4_K_S: + return GGMLTypeQ6_K + case GGUFFileTypeMostlyQ4_K_M: return GGMLTypeQ4_K - case GGUFFileTypeMostlyQ5_K: + case GGUFFileTypeMostlyQ5_K_S: + return GGMLTypeQ5_K + case GGUFFileTypeMostlyQ5_K_M: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ6_K: return GGMLTypeQ6_K @@ -248,6 +289,10 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeIQ2_XXS case GGUFFileTypeMostlyIQ2_XS: return GGMLTypeIQ2_XS + case GGUFFileTypeMostlyQ2_K_S: + return GGMLTypeQ2_K + case GGUFFileTypeMostlyIQ3_XS: + return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ3_XXS: return GGMLTypeIQ3_XXS case GGUFFileTypeMostlyIQ1_S: @@ -256,7 +301,11 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeIQ4_NL case GGUFFileTypeMostlyIQ3_S: return GGMLTypeIQ3_S + case GGUFFileTypeMostlyIQ3_M: + return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ2_S: + return GGMLTypeIQ2_XS + case GGUFFileTypeMostlyIQ2_M: return GGMLTypeIQ2_S case GGUFFileTypeMostlyIQ4_XS: return GGMLTypeIQ4_XS @@ -274,39 +323,122 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeTQ1_0 case GGUFFileTypeMostlyTQ2_0: return GGMLTypeTQ2_0 - case GGUFFileTypeMostlyIQ4_NL_4_4: - return GGMLTypeIQ4_NL_4_4 - case GGUFFileTypeMostlyIQ4_NL_4_8: - return GGMLTypeIQ4_NL_4_8 - case GGUFFileTypeMostlyIQ4_NL_8_8: - return GGMLTypeIQ4_NL_8_8 + case GGUFFileTypeMostlyMXFP4: + return GGMLTypeMXFP4 default: } return _GGMLTypeCount } -// guessFileType guesses the GGUF file type by -// statistically analyzing the tensor types, -// which is inspired by -// https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML#provided-files. -func (gf *GGUFFile) guessFileType(arch string) GGUFFileType { - if len(gf.TensorInfos) == 0 { - return _GGUFFileTypeCount +// extractFileType extracts the GGUF file type from the metadata, +// it tries to return the descriptor of the file type. +func (gf *GGUFFile) extractFileType(arch string) (fileType GGUFFileType, fileTypeDescriptor string) { + fileType, fileTypeDescriptor = _GGUFFileTypeCount, "Unknown" + + const fileTypeKey = "general.file_type" + m, _ := gf.Header.MetadataKV.Index([]string{ + fileTypeKey, + }) + if v, ok := m[fileTypeKey]; ok { + fileType = GGUFFileType(ValueNumeric[uint32](v)) + } + + if fileType == _GGUFFileTypeCount { + // Guess. 
+ if len(gf.TensorInfos) != 0 { + cm := make(map[GGMLType]int) + for i := range gf.TensorInfos { + switch { + case arch != "diffusion" && + !strings.HasPrefix(gf.TensorInfos[i].Name, "token_embd") && + !strings.HasPrefix(gf.TensorInfos[i].Name, "blk.") && + !strings.Contains(gf.TensorInfos[i].Name, "_norm") && + !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): + continue + case arch == "diffusion" && + !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): + continue + } + cm[gf.TensorInfos[i].Type]++ + } + fileType = GetFileType(cm) + } } + if fileType == _GGUFFileTypeCount { + return fileType, fileTypeDescriptor + } + + fileTypeDescriptor = strings.TrimPrefix(fileType.String(), "MOSTLY_") - // Count. - cm := make(map[GGMLType]int) - for i := range gf.TensorInfos { - switch { - case arch != "diffusion" && !strings.HasPrefix(gf.TensorInfos[i].Name, "blk."): - continue - case arch == "diffusion" && !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): - continue + const tokenEmbedWeightTensorName = "token_embd.weight" + + switch fileType { + case GGUFFileTypeMostlyQ4_0: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { + fileTypeDescriptor = "Q4_0_L" + } + } + case GGUFFileTypeMostlyQ4_1: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { + fileTypeDescriptor = "Q4_1_L" + } + } + case GGUFFileTypeMostlyQ5_0: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q5_0_L" + } + } + case GGUFFileTypeMostlyQ5_1: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q5_1_L" + } + } + case GGUFFileTypeMostlyQ2_K: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ4_K { + fileTypeDescriptor = "Q2_K_L" + } + } + case GGUFFileTypeMostlyQ3_K_M: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q3_K_L" + } + } + case GGUFFileTypeMostlyQ4_K_M: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q4_K_L" + } + } + case GGUFFileTypeMostlyQ5_K_M: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q5_K_L" + } + } + case GGUFFileTypeMostlyQ6_K: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q6_K_L" + } } - cm[gf.TensorInfos[i].Type]++ } - return GetFileType(cm) + return fileType, fileTypeDescriptor } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the given tensors counter. @@ -326,7 +458,7 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { // Guess. 
if ts[0] == GGMLTypeF32 { if len(ts) == 1 { - return GGUFFileTypeAllF32 + return GGUFFileTypeMostlyF32 } ts[0] = ts[1] } @@ -348,42 +480,54 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { case GGMLTypeQ8_0: return GGUFFileTypeMostlyQ8_0 case GGMLTypeQ2_K: + if ts[len(ts)-1] == GGMLTypeQ5_K { + return GGUFFileTypeMostlyQ2_K_S + } return GGUFFileTypeMostlyQ2_K case GGMLTypeQ3_K: - switch ts[1] { - case GGMLTypeQ4_K: // Legacy, Q3_K_M. - return GGUFFileTypeMostlyQ4_K - case GGMLTypeQ5_K: // Legacy, Q3_K_L. - return GGUFFileTypeMostlyQ5_K - default: // Legacy. Q3_K_S - return GGUFFileTypeMostlyQ3_K + if cm[GGMLTypeQ8_0] > 0 || + (cm[GGMLTypeQ5_K] > 1 && cm[GGMLTypeQ4_K] == 0) { + return GGUFFileTypeMostlyQ3_K_L + } + if cm[GGMLTypeQ4_K] > 1 { + return GGUFFileTypeMostlyQ3_K_M } + return GGUFFileTypeMostlyQ3_K_S case GGMLTypeQ4_K: - if len(ts) > 2 && ts[2] == GGMLTypeQ6_K { // Legacy, Q4_K_M. - return GGUFFileTypeMostlyIQ2_XXS + if cm[GGMLTypeQ6_K] > 1 { + return GGUFFileTypeMostlyQ4_K_M } - return GGUFFileTypeMostlyQ6_K // Legacy. Q4_K_S + if cm[GGMLTypeQ3_K] > 1 { + return GGUFFileTypeMostlyQ3_K_M + } + return GGUFFileTypeMostlyQ4_K_S case GGMLTypeQ5_K: - if len(ts) > 2 && ts[2] == GGMLTypeQ6_K { // Legacy, Q5_K_M. - return GGUFFileTypeMostlyIQ3_XXS + if cm[GGMLTypeQ6_K] > 1 { + return GGUFFileTypeMostlyQ5_K_M } - return GGUFFileTypeMostlyIQ2_XS // Legacy. Q5_K_S + return GGUFFileTypeMostlyQ5_K_S case GGMLTypeQ6_K: - return GGUFFileTypeMostlyIQ1_S // Legacy. Q6_K + return GGUFFileTypeMostlyQ6_K case GGMLTypeIQ2_XXS: return GGUFFileTypeMostlyIQ2_XXS case GGMLTypeIQ2_XS: + if cm[GGMLTypeIQ4_XS] > 1 { + return GGUFFileTypeMostlyIQ2_S + } return GGUFFileTypeMostlyIQ2_XS + case GGMLTypeIQ2_S: + return GGUFFileTypeMostlyIQ2_M case GGMLTypeIQ3_XXS: return GGUFFileTypeMostlyIQ3_XXS + case GGMLTypeIQ3_S: + if cm[GGMLTypeIQ3_XXS] > 1 { + return GGUFFileTypeMostlyIQ3_XS + } + return GGUFFileTypeMostlyIQ3_S case GGMLTypeIQ1_S: return GGUFFileTypeMostlyIQ1_S case GGMLTypeIQ4_NL: return GGUFFileTypeMostlyIQ4_NL - case GGMLTypeIQ3_S: - return GGUFFileTypeMostlyIQ3_S - case GGMLTypeIQ2_S: - return GGUFFileTypeMostlyIQ2_S case GGMLTypeIQ4_XS: return GGUFFileTypeMostlyIQ4_XS case GGMLTypeIQ1_M: @@ -400,12 +544,8 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { return GGUFFileTypeMostlyTQ1_0 case GGMLTypeTQ2_0: return GGUFFileTypeMostlyTQ2_0 - case GGMLTypeIQ4_NL_4_4: - return GGUFFileTypeMostlyIQ4_NL_4_4 - case GGMLTypeIQ4_NL_4_8: - return GGUFFileTypeMostlyIQ4_NL_4_8 - case GGMLTypeIQ4_NL_8_8: - return GGUFFileTypeMostlyIQ4_NL_8_8 + case GGMLTypeMXFP4: + return GGUFFileTypeMostlyMXFP4 default: } return _GGUFFileTypeCount diff --git a/vendor/github.com/gpustack/gguf-parser-go/ggml.go b/vendor/github.com/gpustack/gguf-parser-go/ggml.go index 07146935..7e17d376 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/ggml.go +++ b/vendor/github.com/gpustack/gguf-parser-go/ggml.go @@ -9,11 +9,11 @@ import ( // Types for GGMLType. type ( // GGMLType is a type of GGML tensor, - // see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/include/ggml.h#L363-L401. + // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L368-L410. GGMLType uint32 // GGMLTypeTrait holds the trait of a GGMLType, - // see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/src/ggml.c#L663-L1082. 
+ // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/src/ggml.c#L586-L876. GGMLTypeTrait struct { BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. @@ -24,6 +24,8 @@ type ( // GGMLType constants. // // GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. +// GGMLTypeQ4_0_4_4, GGMLTypeQ4_0_4_8, GGMLTypeQ4_0_8_8 are deprecated. +// GGMLTypeIQ4_NL_4_4, GGMLTypeIQ4_NL_4_8, GGMLTypeIQ4_NL_8_8 are deprecated. const ( GGMLTypeF32 GGMLType = iota GGMLTypeF16 @@ -64,6 +66,7 @@ const ( GGMLTypeIQ4_NL_4_4 GGMLTypeIQ4_NL_4_8 GGMLTypeIQ4_NL_8_8 + GGMLTypeMXFP4 _GGMLTypeCount // Unknown ) @@ -108,6 +111,7 @@ var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ GGMLTypeIQ4_NL_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, + GGMLTypeMXFP4: {BlockSize: 32, TypeSize: 17, Quantized: true}, } // Trait returns the GGMLTypeTrait of the GGMLType. @@ -186,26 +190,28 @@ const ( // GGMLComputationGraphSize is the size of GGML computation graph in bytes. GGMLComputationGraphSize = 80 - // GGMLComputationGraphNodesMaximum is the maximum nodes of the computation graph, - // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L103. - GGMLComputationGraphNodesMaximum = 8192 - - // GGMLComputationGraphNodesDefault is the default nodes of the computation graph, - // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L237. - GGMLComputationGraphNodesDefault = 2048 + // GGMLComputationBitsetSize is the size of GGML computation bitset in bytes, + // see https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-impl.h#L165. + GGMLComputationBitsetSize = 4 ) // GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, -// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L18905-L18917. +// see https://github.com/ggml-org/ggml/blob/5592ffda9c417c3c12232c828247c23d17004c88/src/ggml.c#L5941-L5956. func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 { - const pointerSize = 8 + const ps = 8 // c++ pointer size + + hs := GGMLHashSize(nodes * 2) - var g uint64 = GGMLComputationGraphSize - g += pointerSize * nodes * 2 + var g uint64 = GGMLComputationGraphSize // graph + g += GGMLPadding(nodes*ps, ps) // nodes + g += GGMLPadding(nodes*ps, ps) // leafs + g += GGMLPadding(nodes*ps, ps) // parents + g += GGMLPadding(hs*ps, ps) // hash keys if grads { - g += pointerSize * nodes + g += GGMLPadding(hs*ps, ps) // grads + g += GGMLPadding(hs*ps, ps) // grad_accs } - g += pointerSize * GGMLHashSize(nodes) + g += GGMLPadding(GGMLBitsetSize(hs)*GGMLComputationBitsetSize, GGMLComputationBitsetSize) // bitset return GGMLObjectSize + GGMLMemoryPadding(g) } @@ -231,3 +237,9 @@ func GGMLHashSize(base uint64) uint64 { } return primes[i] } + +// GGMLBitsetSize returns the size of the bitset for the given number of bits, +// see https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/ggml/src/ggml-impl.h#L166-L171. 
+func GGMLBitsetSize(n uint64) uint64 { + return (n + (GGMLComputationBitsetSize*8 - 1)) >> 5 +} diff --git a/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go b/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go index 45c4cb81..e2672b0b 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go +++ b/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go @@ -36,7 +36,7 @@ const ( // since llama3.1, the user agent is required to be set, // otherwise the request will be rejected by 412. func OllamaUserAgent() string { - return fmt.Sprintf("ollama/0.3.3 (%s %s) Go/%s", runtime.GOARCH, runtime.GOOS, runtime.Version()) + return fmt.Sprintf("ollama/9.9.9 (%s %s) Go/%s", runtime.GOARCH, runtime.GOOS, runtime.Version()) } // OllamaRegistryAuthorizeRetry returns true if the request should be retried with authorization. diff --git a/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go b/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go index 42b10c3a..b1deb782 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go +++ b/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go @@ -3,45 +3,30 @@ package httpx import ( "context" "net" - "time" - - "github.com/rs/dnscache" ) -// DefaultResolver is the default DNS resolver used by the package, -// which caches DNS lookups in memory. -var DefaultResolver = &dnscache.Resolver{ - // NB(thxCode): usually, a high latency DNS is about 3s, - // so we set the timeout to 5s here. - Timeout: 5 * time.Second, - Resolver: net.DefaultResolver, -} - -func init() { - go func() { - t := time.NewTimer(5 * time.Minute) - defer t.Stop() - for range t.C { - DefaultResolver.RefreshWithOptions(dnscache.ResolverRefreshOptions{ - ClearUnused: true, - PersistOnFailure: false, - }) - } - }() -} - func DNSCacheDialContext(dialer *net.Dialer) func(context.Context, string, string) (net.Conn, error) { + cs := map[string][]net.IP{} + return func(ctx context.Context, nw, addr string) (conn net.Conn, err error) { h, p, err := net.SplitHostPort(addr) if err != nil { return nil, err } - ips, err := DefaultResolver.LookupHost(ctx, h) - if err != nil { - return nil, err + ips, ok := cs[h] + if !ok { + ips, err = net.DefaultResolver.LookupIP(ctx, "ip4", h) + if len(ips) == 0 { + ips, err = net.DefaultResolver.LookupIP(ctx, "ip", h) + } + if err != nil { + return nil, err + } + cs[h] = ips } + // Try to connect to each IP address in order. 
for _, ip := range ips { - conn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip, p)) + conn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip.String(), p)) if err == nil { break } diff --git a/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go b/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go index b9879fc0..f7a09caa 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go +++ b/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go @@ -22,7 +22,7 @@ func mmap(f *os.File, size int) ([]byte, error) { return nil, os.NewSyscallError("CloseHandle", err) } - return (*[maxMapSize]byte)(unsafe.Pointer(uintptr(addr)))[:size], nil + return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil } func munmap(b []byte) error { diff --git a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go index 3eaad12f..94541571 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go +++ b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go @@ -47,12 +47,13 @@ func _() { _ = x[GGMLTypeIQ4_NL_4_4-36] _ = x[GGMLTypeIQ4_NL_4_8-37] _ = x[GGMLTypeIQ4_NL_8_8-38] - _ = x[_GGMLTypeCount-39] + _ = x[GGMLTypeMXFP4-39] + _ = x[_GGMLTypeCount-40] } -const _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8Unknown" +const _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8MXFP4Unknown" -var _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 203} +var _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 201, 208} func (i GGMLType) String() string { if i >= GGMLType(len(_GGMLType_index)-1) { diff --git a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go index a6abaa22..ba7f6385 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go +++ b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go @@ -8,45 +8,51 @@ func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. 
var x [1]struct{} - _ = x[GGUFFileTypeAllF32-0] + _ = x[GGUFFileTypeMostlyF32-0] _ = x[GGUFFileTypeMostlyF16-1] _ = x[GGUFFileTypeMostlyQ4_0-2] _ = x[GGUFFileTypeMostlyQ4_1-3] - _ = x[GGUFFileTypeMostlyQ4_1_F16-4] + _ = x[GGUFFileTypeMostlyQ4_1_SOME_F16-4] _ = x[GGUFFileTypeMostlyQ4_2-5] _ = x[GGUFFileTypeMostlyQ4_3-6] _ = x[GGUFFileTypeMostlyQ8_0-7] _ = x[GGUFFileTypeMostlyQ5_0-8] _ = x[GGUFFileTypeMostlyQ5_1-9] _ = x[GGUFFileTypeMostlyQ2_K-10] - _ = x[GGUFFileTypeMostlyQ3_K-11] - _ = x[GGUFFileTypeMostlyQ4_K-12] - _ = x[GGUFFileTypeMostlyQ5_K-13] - _ = x[GGUFFileTypeMostlyQ6_K-14] - _ = x[GGUFFileTypeMostlyIQ2_XXS-15] - _ = x[GGUFFileTypeMostlyIQ2_XS-16] - _ = x[GGUFFileTypeMostlyIQ3_XXS-17] - _ = x[GGUFFileTypeMostlyIQ1_S-18] - _ = x[GGUFFileTypeMostlyIQ4_NL-19] - _ = x[GGUFFileTypeMostlyIQ3_S-20] - _ = x[GGUFFileTypeMostlyIQ2_S-21] - _ = x[GGUFFileTypeMostlyIQ4_XS-22] - _ = x[GGUFFileTypeMostlyIQ1_M-23] - _ = x[GGUFFileTypeMostlyBF16-24] - _ = x[GGUFFileTypeMostlyQ4_0_4_4-25] - _ = x[GGUFFileTypeMostlyQ4_0_4_8-26] - _ = x[GGUFFileTypeMostlyQ4_0_8_8-27] - _ = x[GGUFFileTypeMostlyTQ1_0-28] - _ = x[GGUFFileTypeMostlyTQ2_0-29] - _ = x[GGUFFileTypeMostlyIQ4_NL_4_4-30] - _ = x[GGUFFileTypeMostlyIQ4_NL_4_8-31] - _ = x[GGUFFileTypeMostlyIQ4_NL_8_8-32] - _ = x[_GGUFFileTypeCount-33] + _ = x[GGUFFileTypeMostlyQ3_K_S-11] + _ = x[GGUFFileTypeMostlyQ3_K_M-12] + _ = x[GGUFFileTypeMostlyQ3_K_L-13] + _ = x[GGUFFileTypeMostlyQ4_K_S-14] + _ = x[GGUFFileTypeMostlyQ4_K_M-15] + _ = x[GGUFFileTypeMostlyQ5_K_S-16] + _ = x[GGUFFileTypeMostlyQ5_K_M-17] + _ = x[GGUFFileTypeMostlyQ6_K-18] + _ = x[GGUFFileTypeMostlyIQ2_XXS-19] + _ = x[GGUFFileTypeMostlyIQ2_XS-20] + _ = x[GGUFFileTypeMostlyQ2_K_S-21] + _ = x[GGUFFileTypeMostlyIQ3_XS-22] + _ = x[GGUFFileTypeMostlyIQ3_XXS-23] + _ = x[GGUFFileTypeMostlyIQ1_S-24] + _ = x[GGUFFileTypeMostlyIQ4_NL-25] + _ = x[GGUFFileTypeMostlyIQ3_S-26] + _ = x[GGUFFileTypeMostlyIQ3_M-27] + _ = x[GGUFFileTypeMostlyIQ2_S-28] + _ = x[GGUFFileTypeMostlyIQ2_M-29] + _ = x[GGUFFileTypeMostlyIQ4_XS-30] + _ = x[GGUFFileTypeMostlyIQ1_M-31] + _ = x[GGUFFileTypeMostlyBF16-32] + _ = x[GGUFFileTypeMostlyQ4_0_4_4-33] + _ = x[GGUFFileTypeMostlyQ4_0_4_8-34] + _ = x[GGUFFileTypeMostlyQ4_0_8_8-35] + _ = x[GGUFFileTypeMostlyTQ1_0-36] + _ = x[GGUFFileTypeMostlyTQ2_0-37] + _ = x[GGUFFileTypeMostlyMXFP4-38] + _ = x[_GGUFFileTypeCount-39] } -const _GGUFFileType_name = "F32F16Q4_0Q4_1Q4_1_F16Q4_2Q4_3Q8_0Q5_0Q5_1Q2_KQ3_K/Q3_K_SQ4_K/Q3_K_MQ5_K/Q3_K_LQ6_K/Q4_K_SIQ2_XXS/Q4_K_MIQ2_XS/Q5_K_SIQ3_XXS/Q5_K_MIQ1_S/Q6_KIQ4_NLIQ3_SIQ2_SIQ4_XSIQ1_MBF16Q4_0_4x4Q4_0_4x8Q4_0_8x8TQ1_0TQ2_0IQ4_NL_4x4IQ4_NL_4x8IQ4_NL_8x8Unknown" +const _GGUFFileType_name = "MOSTLY_F32MOSTLY_F16MOSTLY_Q4_0MOSTLY_Q4_1MOSTLY_Q4_1_SOME_F16MOSTLY_Q4_2MOSTLY_Q4_3MOSTLY_Q8_0MOSTLY_Q5_0MOSTLY_Q5_1MOSTLY_Q2_KMOSTLY_Q3_K_SMOSTLY_Q3_K_MMOSTLY_Q3_K_LMOSTLY_Q4_K_SMOSTLY_Q4_K_MMOSTLY_Q5_K_SMOSTLY_Q5_K_MMOSTLY_Q6_KMOSTLY_IQ2_XXSMOSTLY_IQ2_XSMOSTLY_Q2_K_SMOSTLY_IQ3_XSMOSTLY_IQ3_XXSMOSTLY_IQ1_SMOSTLY_IQ4_NLMOSTLY_IQ3_SMOSTLY_IQ3_MMOSTLY_IQ2_SMOSTLY_IQ2_MMOSTLY_IQ4_XSMOSTLY_IQ1_MMOSTLY_BF16MOSTLY_Q4_0_4_4MOSTLY_Q4_0_4_8MOSTLY_Q4_0_8_8MOSTLY_TQ1_0MOSTLY_TQ2_0MOSTLY_MXFP4Unknown" -var _GGUFFileType_index = [...]uint8{0, 3, 6, 10, 14, 22, 26, 30, 34, 38, 42, 46, 57, 68, 79, 90, 104, 117, 131, 141, 147, 152, 157, 163, 168, 172, 180, 188, 196, 201, 206, 216, 226, 236, 243} +var _GGUFFileType_index = [...]uint16{0, 10, 20, 31, 42, 62, 73, 84, 95, 106, 117, 128, 141, 154, 167, 180, 193, 206, 219, 230, 244, 257, 270, 283, 297, 309, 322, 334, 346, 358, 370, 383, 395, 406, 
421, 436, 451, 463, 475, 487, 494} func (i GGUFFileType) String() string { if i >= GGUFFileType(len(_GGUFFileType_index)-1) { diff --git a/vendor/github.com/rs/dnscache/.travis.yml b/vendor/github.com/rs/dnscache/.travis.yml deleted file mode 100644 index ce47932b..00000000 --- a/vendor/github.com/rs/dnscache/.travis.yml +++ /dev/null @@ -1,13 +0,0 @@ -language: go -go: - - "1.8" - - "1.9" - - "1.10" - - "1.11" - - "1.12" - - tip -matrix: - allow_failures: - - go: tip -script: - go test -v -race -cpu=1,2,4 -bench . -benchmem ./... diff --git a/vendor/github.com/rs/dnscache/LICENSE b/vendor/github.com/rs/dnscache/LICENSE deleted file mode 100644 index 71abfee3..00000000 --- a/vendor/github.com/rs/dnscache/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 Olivier Poitrey - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/vendor/github.com/rs/dnscache/README.md b/vendor/github.com/rs/dnscache/README.md deleted file mode 100644 index 267c6996..00000000 --- a/vendor/github.com/rs/dnscache/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# DNS Lookup Cache - -[![license](http://img.shields.io/badge/license-MIT-red.svg?style=flat)](https://raw.githubusercontent.com/rs/dnscache/master/LICENSE) -[![Go Report Card](https://goreportcard.com/badge/github.com/rs/dnscache)](https://goreportcard.com/report/github.com/rs/dnscache) -[![Build Status](https://travis-ci.org/rs/dnscache.svg?branch=master)](https://travis-ci.org/rs/dnscache) -[![Coverage](http://gocover.io/_badge/github.com/rs/dnscache)](http://gocover.io/github.com/rs/dnscache) -[![godoc](http://img.shields.io/badge/godoc-reference-blue.svg?style=flat)](https://godoc.org/github.com/rs/dnscache) - -The dnscache package provides a DNS cache layer to Go's `net.Resolver`. - -# Install - -Install using the "go get" command: - -``` -go get -u github.com/rs/dnscache -``` - -# Usage - -Create a new instance and use it in place of `net.Resolver`. New names will be cached. Call the `Refresh` method at regular interval to update cached entries and cleanup unused ones. - -```go -resolver := &dnscache.Resolver{} - -// First call will cache the result -addrs, err := resolver.LookupHost(context.Background(), "example.com") - -// Subsequent calls will use the cached result -addrs, err = resolver.LookupHost(context.Background(), "example.com") - -// Call to refresh will refresh names in cache. If you pass true, it will also -// remove cached names not looked up since the last call to Refresh. It is a good idea -// to call this method on a regular interval. 
-go func() { - t := time.NewTicker(5 * time.Minute) - defer t.Stop() - for range t.C { - resolver.Refresh(true) - } -}() -``` - -If you are using an `http.Transport`, you can use this cache by specifying a `DialContext` function: - -```go -r := &dnscache.Resolver{} -t := &http.Transport{ - DialContext: func(ctx context.Context, network string, addr string) (conn net.Conn, err error) { - host, port, err := net.SplitHostPort(addr) - if err != nil { - return nil, err - } - ips, err := r.LookupHost(ctx, host) - if err != nil { - return nil, err - } - for _, ip := range ips { - var dialer net.Dialer - conn, err = dialer.DialContext(ctx, network, net.JoinHostPort(ip, port)) - if err == nil { - break - } - } - return - }, -} -``` - -In addition to the `Refresh` method, you can `RefreshWithOptions`. This method adds an option to persist resource records -on failed lookups -```go -r := &Resolver{} -options := dnscache.ResolverRefreshOptions{} -options.ClearUnused = true -options.PersistOnFailure = false -resolver.RefreshWithOptions(options) -``` diff --git a/vendor/github.com/rs/dnscache/dnscache.go b/vendor/github.com/rs/dnscache/dnscache.go deleted file mode 100644 index ddbb923f..00000000 --- a/vendor/github.com/rs/dnscache/dnscache.go +++ /dev/null @@ -1,308 +0,0 @@ -package dnscache - -import ( - "context" - "net" - "net/http/httptrace" - "sync" - "time" - - "golang.org/x/sync/singleflight" -) - -type DNSResolver interface { - LookupHost(ctx context.Context, host string) (addrs []string, err error) - LookupAddr(ctx context.Context, addr string) (names []string, err error) -} - -type Resolver struct { - // Timeout defines the maximum allowed time allowed for a lookup. - Timeout time.Duration - - // Resolver is used to perform actual DNS lookup. If nil, - // net.DefaultResolver is used instead. - Resolver DNSResolver - - once sync.Once - mu sync.RWMutex - cache map[string]*cacheEntry - - // OnCacheMiss is executed if the host or address is not included in - // the cache and the default lookup is executed. - OnCacheMiss func() -} - -type ResolverRefreshOptions struct { - ClearUnused bool - PersistOnFailure bool -} - -type cacheEntry struct { - rrs []string - err error - used bool -} - -// LookupAddr performs a reverse lookup for the given address, returning a list -// of names mapping to that address. -func (r *Resolver) LookupAddr(ctx context.Context, addr string) (names []string, err error) { - r.once.Do(r.init) - return r.lookup(ctx, "r"+addr) -} - -// LookupHost looks up the given host using the local resolver. It returns a -// slice of that host's addresses. -func (r *Resolver) LookupHost(ctx context.Context, host string) (addrs []string, err error) { - r.once.Do(r.init) - return r.lookup(ctx, "h"+host) -} - -// refreshRecords refreshes cached entries which have been used at least once since -// the last Refresh. If clearUnused is true, entries which haven't be used since the -// last Refresh are removed from the cache. 
If persistOnFailure is true, stale -// entries will not be removed on failed lookups -func (r *Resolver) refreshRecords(clearUnused bool, persistOnFailure bool) { - r.once.Do(r.init) - r.mu.RLock() - update := make([]string, 0, len(r.cache)) - del := make([]string, 0, len(r.cache)) - for key, entry := range r.cache { - if entry.used { - update = append(update, key) - } else if clearUnused { - del = append(del, key) - } - } - r.mu.RUnlock() - - if len(del) > 0 { - r.mu.Lock() - for _, key := range del { - delete(r.cache, key) - } - r.mu.Unlock() - } - - for _, key := range update { - r.update(context.Background(), key, false, persistOnFailure) - } -} - -func (r *Resolver) Refresh(clearUnused bool) { - r.refreshRecords(clearUnused, false) -} - -func (r *Resolver) RefreshWithOptions(options ResolverRefreshOptions) { - r.refreshRecords(options.ClearUnused, options.PersistOnFailure) -} - -func (r *Resolver) init() { - r.cache = make(map[string]*cacheEntry) -} - -// lookupGroup merges lookup calls together for lookups for the same host. The -// lookupGroup key is is the LookupIPAddr.host argument. -var lookupGroup singleflight.Group - -func (r *Resolver) lookup(ctx context.Context, key string) (rrs []string, err error) { - var found bool - rrs, err, found = r.load(key) - if !found { - if r.OnCacheMiss != nil { - r.OnCacheMiss() - } - rrs, err = r.update(ctx, key, true, false) - } - return -} - -func (r *Resolver) update(ctx context.Context, key string, used bool, persistOnFailure bool) (rrs []string, err error) { - c := lookupGroup.DoChan(key, r.lookupFunc(ctx, key)) - select { - case <-ctx.Done(): - err = ctx.Err() - if err == context.DeadlineExceeded { - // If DNS request timed out for some reason, force future - // request to start the DNS lookup again rather than waiting - // for the current lookup to complete. - lookupGroup.Forget(key) - } - case res := <-c: - if res.Shared { - // We had concurrent lookups, check if the cache is already updated - // by a friend. - var found bool - rrs, err, found = r.load(key) - if found { - return - } - } - err = res.Err - if err == nil { - rrs, _ = res.Val.([]string) - } - - if err != nil && persistOnFailure { - var found bool - rrs, err, found = r.load(key) - if found { - return - } - } - - r.mu.Lock() - r.storeLocked(key, rrs, used, err) - r.mu.Unlock() - } - return -} - -// lookupFunc returns lookup function for key. The type of the key is stored as -// the first char and the lookup subject is the rest of the key. -func (r *Resolver) lookupFunc(ctx context.Context, key string) func() (interface{}, error) { - if len(key) == 0 { - panic("lookupFunc with empty key") - } - - var resolver DNSResolver = defaultResolver - if r.Resolver != nil { - resolver = r.Resolver - } - - switch key[0] { - case 'h': - return func() (interface{}, error) { - ctx, cancel := r.prepareCtx(ctx) - defer cancel() - - return resolver.LookupHost(ctx, key[1:]) - } - case 'r': - return func() (interface{}, error) { - ctx, cancel := r.prepareCtx(ctx) - defer cancel() - - return resolver.LookupAddr(ctx, key[1:]) - } - default: - panic("lookupFunc invalid key type: " + key) - } -} - -func (r *Resolver) prepareCtx(origContext context.Context) (ctx context.Context, cancel context.CancelFunc) { - ctx = context.Background() - if r.Timeout > 0 { - ctx, cancel = context.WithTimeout(ctx, r.Timeout) - } else { - cancel = func() {} - } - - // If a httptrace has been attached to the given context it will be copied over to the newly created context. 
We only need to copy pointers - // to DNSStart and DNSDone hooks - if trace := httptrace.ContextClientTrace(origContext); trace != nil { - derivedTrace := &httptrace.ClientTrace{ - DNSStart: trace.DNSStart, - DNSDone: trace.DNSDone, - } - - ctx = httptrace.WithClientTrace(ctx, derivedTrace) - } - - return -} - -func (r *Resolver) load(key string) (rrs []string, err error, found bool) { - r.mu.RLock() - var entry *cacheEntry - entry, found = r.cache[key] - if !found { - r.mu.RUnlock() - return - } - rrs = entry.rrs - err = entry.err - used := entry.used - r.mu.RUnlock() - if !used { - r.mu.Lock() - entry.used = true - r.mu.Unlock() - } - return rrs, err, true -} - -func (r *Resolver) storeLocked(key string, rrs []string, used bool, err error) { - if entry, found := r.cache[key]; found { - // Update existing entry in place - entry.rrs = rrs - entry.err = err - entry.used = used - return - } - r.cache[key] = &cacheEntry{ - rrs: rrs, - err: err, - used: used, - } -} - -var defaultResolver = &defaultResolverWithTrace{ - ipVersion: "ip", -} - -// Create a new resolver that only resolves to IPv4 Addresses when looking up Hosts. -// Example: -// -// resolver := dnscache.Resolver{ -// Resolver: NewResolverOnlyV4(), -// } -func NewResolverOnlyV4() DNSResolver { - return &defaultResolverWithTrace{ - ipVersion: "ip4", - } -} - -// Create a new resolver that only resolves to IPv6 Addresses when looking up Hosts. -// Example: -// -// resolver := dnscache.Resolver{ -// Resolver: NewResolverOnlyV6(), -// } -func NewResolverOnlyV6() DNSResolver { - return &defaultResolverWithTrace{ - ipVersion: "ip6", - } -} - -// defaultResolverWithTrace calls `LookupIP` instead of `LookupHost` on `net.DefaultResolver` in order to cause invocation of the `DNSStart` -// and `DNSDone` hooks. By implementing `DNSResolver`, backward compatibility can be ensured. -type defaultResolverWithTrace struct { - ipVersion string -} - -func (d *defaultResolverWithTrace) LookupHost(ctx context.Context, host string) (addrs []string, err error) { - ipVersion := d.ipVersion - if ipVersion != "ip" && ipVersion != "ip4" && ipVersion != "ip6" { - ipVersion = "ip" - } - - // `net.Resolver#LookupHost` does not cause invocation of `net.Resolver#lookupIPAddr`, therefore the `DNSStart` and `DNSDone` tracing hooks - // built into the stdlib are never called. `LookupIP`, despite it's name, can also be used to lookup a hostname but does cause these hooks to be - // triggered. The format of the reponse is different, therefore it needs this thin wrapper converting it. - rawIPs, err := net.DefaultResolver.LookupIP(ctx, ipVersion, host) - if err != nil { - return nil, err - } - - cookedIPs := make([]string, len(rawIPs)) - - for i, v := range rawIPs { - cookedIPs[i] = v.String() - } - - return cookedIPs, nil -} - -func (d *defaultResolverWithTrace) LookupAddr(ctx context.Context, addr string) (names []string, err error) { - return net.DefaultResolver.LookupAddr(ctx, addr) -} diff --git a/vendor/golang.org/x/sync/singleflight/singleflight.go b/vendor/golang.org/x/sync/singleflight/singleflight.go deleted file mode 100644 index 40518309..00000000 --- a/vendor/golang.org/x/sync/singleflight/singleflight.go +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package singleflight provides a duplicate function call suppression -// mechanism. 
-package singleflight // import "golang.org/x/sync/singleflight" - -import ( - "bytes" - "errors" - "fmt" - "runtime" - "runtime/debug" - "sync" -) - -// errGoexit indicates the runtime.Goexit was called in -// the user given function. -var errGoexit = errors.New("runtime.Goexit was called") - -// A panicError is an arbitrary value recovered from a panic -// with the stack trace during the execution of given function. -type panicError struct { - value interface{} - stack []byte -} - -// Error implements error interface. -func (p *panicError) Error() string { - return fmt.Sprintf("%v\n\n%s", p.value, p.stack) -} - -func (p *panicError) Unwrap() error { - err, ok := p.value.(error) - if !ok { - return nil - } - - return err -} - -func newPanicError(v interface{}) error { - stack := debug.Stack() - - // The first line of the stack trace is of the form "goroutine N [status]:" - // but by the time the panic reaches Do the goroutine may no longer exist - // and its status will have changed. Trim out the misleading line. - if line := bytes.IndexByte(stack[:], '\n'); line >= 0 { - stack = stack[line+1:] - } - return &panicError{value: v, stack: stack} -} - -// call is an in-flight or completed singleflight.Do call -type call struct { - wg sync.WaitGroup - - // These fields are written once before the WaitGroup is done - // and are only read after the WaitGroup is done. - val interface{} - err error - - // These fields are read and written with the singleflight - // mutex held before the WaitGroup is done, and are read but - // not written after the WaitGroup is done. - dups int - chans []chan<- Result -} - -// Group represents a class of work and forms a namespace in -// which units of work can be executed with duplicate suppression. -type Group struct { - mu sync.Mutex // protects m - m map[string]*call // lazily initialized -} - -// Result holds the results of Do, so they can be passed -// on a channel. -type Result struct { - Val interface{} - Err error - Shared bool -} - -// Do executes and returns the results of the given function, making -// sure that only one execution is in-flight for a given key at a -// time. If a duplicate comes in, the duplicate caller waits for the -// original to complete and receives the same results. -// The return value shared indicates whether v was given to multiple callers. -func (g *Group) Do(key string, fn func() (interface{}, error)) (v interface{}, err error, shared bool) { - g.mu.Lock() - if g.m == nil { - g.m = make(map[string]*call) - } - if c, ok := g.m[key]; ok { - c.dups++ - g.mu.Unlock() - c.wg.Wait() - - if e, ok := c.err.(*panicError); ok { - panic(e) - } else if c.err == errGoexit { - runtime.Goexit() - } - return c.val, c.err, true - } - c := new(call) - c.wg.Add(1) - g.m[key] = c - g.mu.Unlock() - - g.doCall(c, key, fn) - return c.val, c.err, c.dups > 0 -} - -// DoChan is like Do but returns a channel that will receive the -// results when they are ready. -// -// The returned channel will not be closed. -func (g *Group) DoChan(key string, fn func() (interface{}, error)) <-chan Result { - ch := make(chan Result, 1) - g.mu.Lock() - if g.m == nil { - g.m = make(map[string]*call) - } - if c, ok := g.m[key]; ok { - c.dups++ - c.chans = append(c.chans, ch) - g.mu.Unlock() - return ch - } - c := &call{chans: []chan<- Result{ch}} - c.wg.Add(1) - g.m[key] = c - g.mu.Unlock() - - go g.doCall(c, key, fn) - - return ch -} - -// doCall handles the single call for a key. 
-func (g *Group) doCall(c *call, key string, fn func() (interface{}, error)) { - normalReturn := false - recovered := false - - // use double-defer to distinguish panic from runtime.Goexit, - // more details see https://golang.org/cl/134395 - defer func() { - // the given function invoked runtime.Goexit - if !normalReturn && !recovered { - c.err = errGoexit - } - - g.mu.Lock() - defer g.mu.Unlock() - c.wg.Done() - if g.m[key] == c { - delete(g.m, key) - } - - if e, ok := c.err.(*panicError); ok { - // In order to prevent the waiting channels from being blocked forever, - // needs to ensure that this panic cannot be recovered. - if len(c.chans) > 0 { - go panic(e) - select {} // Keep this goroutine around so that it will appear in the crash dump. - } else { - panic(e) - } - } else if c.err == errGoexit { - // Already in the process of goexit, no need to call again - } else { - // Normal return - for _, ch := range c.chans { - ch <- Result{c.val, c.err, c.dups > 0} - } - } - }() - - func() { - defer func() { - if !normalReturn { - // Ideally, we would wait to take a stack trace until we've determined - // whether this is a panic or a runtime.Goexit. - // - // Unfortunately, the only way we can distinguish the two is to see - // whether the recover stopped the goroutine from terminating, and by - // the time we know that, the part of the stack trace relevant to the - // panic has been discarded. - if r := recover(); r != nil { - c.err = newPanicError(r) - } - } - }() - - c.val, c.err = fn() - normalReturn = true - }() - - if !normalReturn { - recovered = true - } -} - -// Forget tells the singleflight to forget about a key. Future calls -// to Do for this key will call the function rather than waiting for -// an earlier call to complete. -func (g *Group) Forget(key string) { - g.mu.Lock() - delete(g.m, key) - g.mu.Unlock() -} diff --git a/vendor/modules.txt b/vendor/modules.txt index f05f1c38..23ea2b91 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -144,8 +144,8 @@ github.com/docker/go-connections/tlsconfig # github.com/docker/go-units v0.5.0 ## explicit github.com/docker/go-units -# github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058 -## explicit; go 1.23.0 +# github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57 +## explicit; go 1.24 github.com/docker/model-distribution/builder github.com/docker/model-distribution/distribution github.com/docker/model-distribution/internal/bundle @@ -246,7 +246,7 @@ github.com/google/go-containerregistry/pkg/v1/types github.com/google/uuid # github.com/gorilla/mux v1.8.1 ## explicit; go 1.20 -# github.com/gpustack/gguf-parser-go v0.14.1 +# github.com/gpustack/gguf-parser-go v0.22.1 ## explicit; go 1.22.0 github.com/gpustack/gguf-parser-go github.com/gpustack/gguf-parser-go/util/anyx @@ -396,9 +396,6 @@ github.com/prometheus/procfs/internal/util # github.com/rivo/uniseg v0.4.7 ## explicit; go 1.18 github.com/rivo/uniseg -# github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 -## explicit; go 1.12 -github.com/rs/dnscache # github.com/russross/blackfriday/v2 v2.1.0 ## explicit github.com/russross/blackfriday/v2 @@ -536,7 +533,6 @@ golang.org/x/net/trace ## explicit; go 1.23.0 golang.org/x/sync/errgroup golang.org/x/sync/semaphore -golang.org/x/sync/singleflight # golang.org/x/sys v0.35.0 ## explicit; go 1.23.0 golang.org/x/sys/cpu
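---

With the gguf-parser-go bump to v0.22.1, the K-quant file types are reported as their S/M/L sub-variants, and the classifier falls back to counting tensor types only when the `general.file_type` metadata key is missing. Below is a minimal sketch of that fallback path using the exported `GetFileType` shown in the diff above; the counter values are made up purely for illustration.

```go
package main

import (
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Hypothetical tensor-type counts, shaped like the counter that
	// extractFileType builds when general.file_type is absent.
	cm := map[parser.GGMLType]int{
		parser.GGMLTypeQ4_K: 200,
		parser.GGMLTypeQ6_K: 40,
		parser.GGMLTypeF32:  60,
	}
	ft := parser.GetFileType(cm)
	fmt.Println(ft) // Q4_K dominant with several Q6_K tensors -> MOSTLY_Q4_K_M
}
```

Note that `extractFileType` further refines the human-readable descriptor (for example to `Q4_K_L`) when the `token_embd.weight` tensor uses a higher-precision type such as Q8_0, so the descriptor string can differ from the enum name printed here.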
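The same bump drops the vendored `rs/dnscache` resolver in favour of a small per-dialer lookup cache inside `util/httpx`. A rough sketch of wiring `DNSCacheDialContext` into an `http.Transport` follows; only the `httpx` call itself comes from the vendored code, the client construction and URL are illustrative.

```go
package main

import (
	"net"
	"net/http"
	"time"

	"github.com/gpustack/gguf-parser-go/util/httpx"
)

func main() {
	dialer := &net.Dialer{Timeout: 10 * time.Second}
	client := &http.Client{
		Transport: &http.Transport{
			// Repeated dials to the same host reuse the IPs cached by the closure.
			DialContext: httpx.DNSCacheDialContext(dialer),
		},
	}
	resp, err := client.Get("https://example.com/") // placeholder endpoint
	if err == nil {
		resp.Body.Close()
	}
}
```

Unlike the removed `dnscache.Resolver`, this cache lives only as long as the returned dial function and has no background refresh, which is why the periodic `RefreshWithOptions` goroutine from the old `init()` also disappears in this diff.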