Skip to content

Commit a73bbd5

Browse files
authored
mtmd: refactor image preprocessing (#21031)
* mtmd: refactor image pre-processing
* correct some places
* correct lfm2
* fix deepseek-ocr on server
* add comment to clarify about mtmd_image_preprocessor_dyn_size
1 parent ded446b commit a73bbd5

8 files changed

Lines changed: 1604 additions & 1541 deletions

File tree

tools/mtmd/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
55
add_library(mtmd
66
mtmd.cpp
77
mtmd-audio.cpp
8+
mtmd-image.cpp
89
mtmd.h
910
mtmd-helper.cpp
1011
mtmd-helper.h

tools/mtmd/clip-impl.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@
5151

5252
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
5353
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
54-
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
5554
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
5655
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
5756
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"

tools/mtmd/clip-model.h

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ enum patch_merge_type {
2828
PATCH_MERGE_SPATIAL_UNPAD,
2929
};
3030

31+
enum resize_algo {
32+
RESIZE_ALGO_BILINEAR, // stretch to target resolution
33+
RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
34+
RESIZE_ALGO_BICUBIC_PILLOW,
35+
// RESIZE_ALGO_LANCZOS, // TODO
36+
};
37+
3138
struct clip_hparams {
3239
int32_t image_size = 0;
3340
int32_t patch_size = 0;
@@ -37,13 +44,26 @@ struct clip_hparams {
3744
int32_t n_head = 0;
3845
int32_t n_layer = 0;
3946
// idefics3
47+
int32_t n_merge = 0; // number of patch merges **per-side**
48+
49+
// for preprocessor
4050
int32_t image_longest_edge = 0;
4151
int32_t image_min_pixels = -1;
4252
int32_t image_max_pixels = -1;
43-
int32_t n_merge = 0; // number of patch merges **per-side**
53+
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
54+
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
55+
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
4456

57+
// (preprocessor) for llava-uhd style models
58+
std::vector<clip_image_size> image_res_candidates;
4559
int32_t preproc_min_tiles = 0;
4660
int32_t preproc_max_tiles = 0;
61+
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
62+
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
63+
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
64+
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
65+
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
66+
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
4767

4868
float image_mean[3];
4969
float image_std[3];
@@ -60,8 +80,6 @@ struct clip_hparams {
6080
float eps = 1e-6;
6181
float rope_theta = 0.0;
6282

63-
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
64-
int32_t image_crop_resolution;
6583
std::unordered_set<int32_t> vision_feature_layer;
6684
int32_t attn_window_size = 0;
6785
int32_t n_wa_pattern = 0;

tools/mtmd/clip.cpp

Lines changed: 52 additions & 1398 deletions
Large diffs are not rendered by default.

tools/mtmd/clip.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
9797
*/
9898
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
9999

100-
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
101-
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
102-
103100
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
104101

105102
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);

0 commit comments

Comments
 (0)