Flume 与 Hive 的简单结合

2023-01-01 19:51:24

1. 日志格式　

#Software: Microsoft Internet Information Services 6.0

#Version: 1.0

#Date: -- ::

#Fields: date time s-sitename s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status

-- :: W3SVC1 :da8:::: GET /skin6/film_sort.asp id=  - :da8::f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko

-- :: W3SVC1 :da8:::: GET /news.asp -  - :da8::f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko

-- :: W3SVC1 :da8:::: GET /UploadFile/.jpg -  - :da8::f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko

2. 建立对应的 Hive 表：

-- External Hive table over the backtick-delimited IIS access-log files stored
-- under /movielog.
-- Columns follow the order of the W3C #Fields header of the log:
--   log_date         <- date and time, stored as one TIMESTAMP
--   s_sitename       <- s-sitename
--   s_ip             <- s-ip
--   cs_method        <- cs-method
--   cs_uri_stem      <- cs-uri-stem
--   cs_uri_query     <- cs-uri-query
--   s_port           <- s-port
--   cs_username      <- cs-username
--   c_ip             <- c-ip
--   user_agen        <- cs(User-Agent)
--   sc_status        <- sc-status
--   sc_substatus     <- sc-substatus
--   sc_win32_status  <- sc-win32-status
-- NOTE(review): user_agen looks like a typo of user_agent. The name is kept
-- unchanged so that existing queries against this table still resolve.
CREATE EXTERNAL TABLE IF NOT EXISTS exmovielog (
    log_date        TIMESTAMP,
    s_sitename      STRING,
    s_ip            STRING,
    cs_method       STRING,
    cs_uri_stem     STRING,
    cs_uri_query    STRING,
    s_port          INT,
    cs_username     STRING,
    c_ip            STRING,
    user_agen       STRING,
    sc_status       INT,
    sc_substatus    INT,
    sc_win32_status INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '`'
LOCATION '/movielog';

3. Flume 配置文件

# Components of agent1: one source, one channel and one sink.
agent1.sources  = source1
agent1.channels = channel1
agent1.sinks    = sink1

# channel1 is a file channel; its checkpoint and data directories both live
# under /home/hadoop_admin/flumeTemp/fchannel/spool.
agent1.channels.channel1.type          = file
agent1.channels.channel1.checkpointDir = /home/hadoop_admin/flumeTemp/fchannel/spool/checkpoint
agent1.channels.channel1.dataDirs      = /home/hadoop_admin/flumeTemp/fchannel/spool/data
# NOTE(review): the capacity value is empty in this copy of the file. The
# original number is not recoverable from here - TODO supply it before use.
agent1.channels.channel1.capacity      =

# source1 is a spooling-directory source: it reads the files dropped into
# /home/hadoop_admin/movielog, decodes them as GBK and hands the events to channel1.
agent1.sources.source1.type         = spooldir
agent1.sources.source1.channels     = channel1
agent1.sources.source1.spoolDir     = /home/hadoop_admin/movielog
agent1.sources.source1.inputCharset = GBK
agent1.sources.source1.fileHeader   = true
agent1.sources.source1.deletePolicy = immediate
# NOTE(review): the batchSize value is empty in this copy of the file. The
# original number is not recoverable from here - TODO supply it before use.
agent1.sources.source1.batchSize    =

# Interceptor chain on source1. The interceptors are listed in this order:
#   i1              - drop the log header lines, which start with '#'
#   search-replace1 - join date and time with 'T' so that the space between
#                     them is not turned into a field delimiter by the next step
#   search-replace2 - turn every whitespace character into the '`' delimiter
#   search-replace3 - turn the 'T' between date and time back into a space
agent1.sources.source1.interceptors = i1 search-replace1 search-replace2 search-replace3

# i1: keep only events whose first character is not '#'.
# excludeEvents is left at its default (false), so events that match the regex
# are kept and all others are dropped.
agent1.sources.source1.interceptors.i1.type = regex_filter
agent1.sources.source1.interceptors.i1.regex = ^[^#].*$
# Alternative with the same aim: exclude the events that match ^# instead.
# agent1.sources.source1.interceptors.i1.excludeEvents = true
# agent1.sources.source1.interceptors.i1.regex = ^#

# search-replace1: connect the date and the time into one token, <date>T<time>.
# The replacement must name both capture groups; a '$' without a group number
# is rejected by the Java regex engine.
agent1.sources.source1.interceptors.search-replace1.type = search_replace
agent1.sources.source1.interceptors.search-replace1.searchPattern = (\\d\\d\\d\\d-\\d\\d-\\d\\d)\\s(\\d\\d:\\d\\d:\\d\\d)
agent1.sources.source1.interceptors.search-replace1.replaceString = $1T$2

# search-replace2: change the field separator from whitespace to '`'.
agent1.sources.source1.interceptors.search-replace2.type = search_replace
agent1.sources.source1.interceptors.search-replace2.searchPattern = \\s
agent1.sources.source1.interceptors.search-replace2.replaceString = `

# search-replace3: restore the space between date and time, giving
# <date> <time> as the first field of each record.
agent1.sources.source1.interceptors.search-replace3.type = search_replace
agent1.sources.source1.interceptors.search-replace3.searchPattern = (\\d\\d\\d\\d-\\d\\d-\\d\\d)T(\\d\\d:\\d\\d:\\d\\d)
agent1.sources.source1.interceptors.search-replace3.replaceString = $1 $2

# sink1 is an HDFS sink: it drains channel1 and writes the events as plain
# text (DataStream / Text) under hdfs://master:9000/movielog.
agent1.sinks.sink1.type             = hdfs
agent1.sinks.sink1.channel          = channel1
agent1.sinks.sink1.hdfs.path        = hdfs://master:9000/movielog
agent1.sinks.sink1.hdfs.fileType    = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text

# NOTE(review): every value below is empty in this copy of the file. The
# original numbers are not recoverable from here - TODO supply them before use.
agent1.sinks.sink1.hdfs.rollInterval =
agent1.sinks.sink1.hdfs.idleTimeout  =
agent1.sinks.sink1.hdfs.rollCount    =
agent1.sinks.sink1.hdfs.rollSize     =
agent1.sinks.sink1.hdfs.batchSize    =
agent1.sinks.sink1.hdfs.callTimeout  =

码农公寓

相关文章