config.json

{
  "architectures": [
    "DotsOCRForCausalLM"
  ],
  "model_type": "dots_ocr",
  "auto_map": {
    "AutoConfig": "configuration_dots.DotsOCRConfig",
    "AutoModelForCausalLM": "modeling_dots_ocr.DotsOCRForCausalLM"
  },
  "attention_bias": true,
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936,
  "image_token_id": 151665,
  "video_token_id": 151656,
  "vision_config": {
    "embed_dim": 1536,
    "hidden_size": 1536,
    "intermediate_size": 4224,
    "num_hidden_layers": 42,
    "num_attention_heads": 12,
    "num_channels": 3,
    "patch_size": 14,
    "post_norm": true,
    "rms_norm_eps": 1e-05,
    "spatial_merge_size": 2,
    "temporal_patch_size": 1,
    "use_bias": false,
    "attn_implementation": "sdpa",
    "init_merger_std": 0.02,
    "initializer_range": 0.02,
    "is_causal": false
  }
}
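
For reference, a minimal sketch of loading this config via transformers' auto classes. Because "auto_map" points at custom code (configuration_dots.DotsOCRConfig, modeling_dots_ocr.DotsOCRForCausalLM), loading requires trust_remote_code=True. The repo id used below is an assumption; substitute the actual model path or a local checkout.

from transformers import AutoConfig

# "rednote-hilab/dots.ocr" is a hypothetical repo id -- replace with the real path.
config = AutoConfig.from_pretrained(
    "rednote-hilab/dots.ocr",
    trust_remote_code=True,  # needed: auto_map resolves to custom config/model code
)
print(config.model_type)   # "dots_ocr"
print(config.hidden_size)  # 1536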