Elasticsearch mapping
This article was published quite a while ago, so some of the content may be outdated; read with that in mind.
Collecting logs used to be a fairly simple routine for me, and I never dug very deep into things like the ES mapping, but sometimes skills are driven by requirements.
The logs we currently collect look like this:
{
  "sign": "test-log",
  "@timestamp": "2020-07-05T17:43:12+08:00",
  "type": "filter",
  "sale_id": 2084090132,
  "sale_uri": "2003261352dvxv50",
  "shop_id": 47516579,
  "shop_uri": "1910201845lawpvt",
  "cat_id": 4,
  "sec_cat_id": 4001,
  "rule": "startprice",
  "description": "拍品起拍价\u003e0",
  "score": 0,
  "arguments": "{\"startPrice\":2600}"
}
Some fields in these logs will later be used in aggregations, so the score field must end up as float once it is written. If nothing is specified, dynamic mapping assigns a different type on the first write (here long, because the first value it sees is the integer 0).
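As a quick illustration (the scratch index name below is made up purely for demonstration), writing such a document into an index with no explicit mapping and then asking for the mapping shows the type that dynamic mapping picked:

POST test-dynamic/_doc
{
  "score": 0
}

GET test-dynamic/_mapping

In that response score shows up as long, not float, which is why the type has to be pinned down explicitly.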
There are two ways to solve this.
# Option 1: specify the mapping when creating the index
PUT test-index
{
  "mappings" : {
    "properties" : {
      "score" : {
        "type" : "float"
      }
    }
  }
}
Response:
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "test-index"
}
Check the index mapping:
GET test-index/_mapping
Response:
{
  "test-index": {
    "mappings": {
      "properties": {
        "score": {
          "type": "float"
        }
      }
    }
  }
}
With this in place, documents written to this index have score mapped as float.
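Since the whole point was aggregation, a simple sanity check (the query below is only an illustrative sketch) is to run a numeric aggregation on the field:

GET test-index/_search
{
  "size": 0,
  "aggs": {
    "avg_score": {
      "avg": { "field": "score" }
    }
  }
}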
There is a catch, though: the mapping above was defined for a single index. Log indices are normally created per day, so each new daily index would not pick up this mapping automatically. This is where index templates come in.
PUT _template/template_test
{
  "index_patterns": ["test*"],
  "order" : 1,
  "settings" : {
    "number_of_shards": 1,
    "number_of_replicas" : 2
  },
  "mappings" : {
    "properties" : {
      "score" : {
        "type" : "float"
      }
    }
  }
}
This creates an index template: any index whose name starts with test will, once it is created and documents are written into it, map the score field as float.
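The stored template can be checked to make sure it is in place:

GET _template/template_test

And the mapping of a daily index created after the template was added looks like this: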
GET test-index-2020-03-30/_mapping
{
  "test-index-2020-03-30" : {
    "mappings" : {
      "properties" : {
        "@timestamp" : {
          "type" : "date"
        },
        "@version" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "arguments" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "batch" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "cat_id" : {
          "type" : "long"
        },
        "description" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "host" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "path" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "rule" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "sale_id" : {
          "type" : "long"
        },
        "sale_uri" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "score" : {
          "type" : "float"
        },
        "sec_cat_id" : {
          "type" : "long"
        },
        "shop_id" : {
          "type" : "long"
        },
        "shop_uri" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "sign" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "type" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
      }
    }
  }
}
# Option 2: handle it in Logstash
A simpler alternative is to handle this at the Logstash layer: as logs pass through Logstash, convert the relevant fields to the desired type. The configuration looks like this:
input {
  kafka {
    bootstrap_servers => "192.168.0.1:9092"
    group_id => "test-index"
    consumer_threads => 6
    topics => ["test-index"]
    client_id => "test-index"
    codec => "json"
    check_crcs => "false"
  }
}
filter {
  mutate {
    convert => {
      "score" => "float"
    }
  }
}
output {
  elasticsearch {
    hosts => ["http://192.168.0.2:9208"]
    index => "test-index-%{+YYYY-MM-dd-HH}"
  }
}
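Here mutate/convert coerces score to a floating-point value on every event before it is shipped to Elasticsearch, so the first document written to a brand-new hourly index should already carry a float and dynamic mapping should map the field accordingly. As a sanity check (the index name below just follows the hourly pattern from the output section and is only an example), inspect the mapping of a freshly created index:

GET test-index-2020-07-05-17/_mapping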
In production we also use this capability to do some extra processing on NGINX access logs:
input {
  kafka {
    bootstrap_servers => "192.168.0.1:9092"
    group_id => "nginx_access"
    consumer_threads => 6
    topics => ["nginx_access"]
    codec => "json"
  }
}
filter {
  mutate {
    # split request_uri on "?" into [path, query string]
    split => ["request_uri", "?"]
    add_field => {
      "uri_path" => "%{[request_uri][0]}"
      "uri_query" => "%{[request_uri][1]}"
    }
    remove_field => ["request_uri"]
    # force numeric types so they can be aggregated later
    convert => {
      "response" => "integer"
      "body_bytes_sent" => "integer"
      "request_time" => "float"
      "upstream_response_time" => "float"
    }
  }
}
output {
  elasticsearch {
    hosts => ["http://192.168.0.2:9208"]
    index => "nginx_access-%{+YYYY.MM.dd}-1"
  }
}
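For example (the URI and values below are made up purely for illustration), an access-log event whose request_uri is /item/list?page=2&size=10 comes out the other side with fields roughly like this:

{
  "uri_path": "/item/list",
  "uri_query": "page=2&size=10",
  "response": 200,
  "body_bytes_sent": 1024,
  "request_time": 0.012,
  "upstream_response_time": 0.008
}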
Last updated: 2024/11/19, 23:11:42