First, create a database.
You can think of this step as associating Athena with an S3 bucket.
The first time, you can click through from the prompt shown here;
to change it later, just open the "Settings" tab.
Enter the location of the S3 bucket.
After that, you can create a database.
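For example, a minimal statement (the database name alb_db here is only a placeholder):
CREATE DATABASE IF NOT EXISTS alb_db;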
With these few steps, the preliminary setup is complete.
Next, we need to create a table.
Covering everything (slower and more expensive to scan)
CREATE EXTERNAL TABLE IF NOT EXISTS alb_access_logs (
    type string,
    time string,
    elb string,
    client_ip string,
    client_port int,
    target_ip string,
    target_port int,
    request_processing_time double,
    target_processing_time double,
    response_processing_time double,
    elb_status_code int,
    target_status_code string,
    received_bytes bigint,
    sent_bytes bigint,
    request_verb string,
    request_url string,
    request_proto string,
    user_agent string,
    ssl_cipher string,
    ssl_protocol string,
    target_group_arn string,
    trace_id string,
    domain_name string,
    chosen_cert_arn string,
    matched_rule_priority string,
    request_creation_time string,
    actions_executed string,
    redirect_url string,
    lambda_error_reason string,
    target_port_list string,
    target_status_code_list string,
    classification string,
    classification_reason string,
    conn_trace_id string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' =
'([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) (.*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^ ]*)\" \"([^\s]+?)\" \"([^\s]+)\" \"([^ ]*)\" \"([^ ]*)\" ?([^ ]*)?( .*)?')
LOCATION 's3://DOC-EXAMPLE-BUCKET/access-log-folder-path/'
Remember to change the part in the red box below (the LOCATION value) to your own S3 bucket path.
Once creation finishes, you can see the tables listed.
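To sanity-check the new table, a quick preview works (a minimal sketch, using the alb_access_logs name from the DDL above):
SELECT * FROM alb_access_logs LIMIT 10;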
From here you can enter any query you like.
The following example looks for HTTP status codes ≥ 500 and lists the client IPs:
SELECT client_ip, elb_status_code, COUNT(*) AS request_count
FROM <database_name>.<table_name>
WHERE elb_status_code >= 500
GROUP BY client_ip, elb_status_code
ORDER BY request_count DESC;
This is the result after the query traversed a~ll~ the data: a whopping 26.64 GB scanned. Since Athena bills by the amount of data scanned, this gets expensive fast.
Query optimization
Specific dates only (faster and cheaper to scan)
The trick is to also define a day partition column when creating the table. Note that if you already created the unpartitioned table above under the same name, drop it or pick a different name first, because IF NOT EXISTS would otherwise keep the old definition.
CREATE EXTERNAL TABLE IF NOT EXISTS alb_access_logs (
    type string,
    time string,
    elb string,
    client_ip string,
    client_port int,
    target_ip string,
    target_port int,
    request_processing_time double,
    target_processing_time double,
    response_processing_time double,
    elb_status_code int,
    target_status_code string,
    received_bytes bigint,
    sent_bytes bigint,
    request_verb string,
    request_url string,
    request_proto string,
    user_agent string,
    ssl_cipher string,
    ssl_protocol string,
    target_group_arn string,
    trace_id string,
    domain_name string,
    chosen_cert_arn string,
    matched_rule_priority string,
    request_creation_time string,
    actions_executed string,
    redirect_url string,
    lambda_error_reason string,
    target_port_list string,
    target_status_code_list string,
    classification string,
    classification_reason string,
    conn_trace_id string
)
PARTITIONED BY (
    day STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' =
'([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) (.*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^ ]*)\" \"([^\s]+?)\" \"([^\s]+)\" \"([^ ]*)\" \"([^ ]*)\" ?([^ ]*)?( .*)?')
LOCATION 's3://DOC-EXAMPLE-BUCKET/AWSLogs/<ACCOUNT-NUMBER>/elasticloadbalancing/<REGION>/'
TBLPROPERTIES (
    "projection.enabled" = "true",
    "projection.day.type" = "date",
    "projection.day.range" = "2022/01/01,NOW",
    "projection.day.format" = "yyyy/MM/dd",
    "projection.day.interval" = "1",
    "projection.day.interval.unit" = "DAYS",
    "storage.location.template" = "s3://DOC-EXAMPLE-BUCKET/AWSLogs/<ACCOUNT-NUMBER>/elasticloadbalancing/<REGION>/${day}"
)
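Because partition projection is enabled, there is no need to run MSCK REPAIR TABLE or add partitions by hand; Athena derives the day values from the projection settings above. As a quick sanity check, you can count log lines per day over a small window (a sketch; the yyyy/MM/dd format sorts correctly as a plain string, so BETWEEN works):
SELECT day, COUNT(*) AS log_lines
FROM alb_access_logs
WHERE day BETWEEN '2024/07/01' AND '2024/07/07'
GROUP BY day
ORDER BY day;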
Then, with a similar query:
SELECT client_ip, elb_status_code, time
FROM <table_name>
WHERE day = '2024/07/23'
AND elb_status_code > 500
AND parse_datetime(time, 'yyyy-MM-dd''T''HH:mm:ss.SSSSSS''Z')
BETWEEN parse_datetime('2024-07-23-09:00:00', 'yyyy-MM-dd-HH:mm:ss')
AND parse_datetime('2024-07-23-10:00:00', 'yyyy-MM-dd-HH:mm:ss');
The result comes back noticeably faster, and only 101 MB was scanned.
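If you want the same aggregated view as the earlier example, the GROUP BY version just needs the extra day predicate so the scan stays within one partition (a sketch under the same placeholder names):
SELECT client_ip, elb_status_code, COUNT(*) AS request_count
FROM <table_name>
WHERE day = '2024/07/23'
AND elb_status_code >= 500
GROUP BY client_ip, elb_status_code
ORDER BY request_count DESC;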