Big Data Components: Chaining Flume, Kafka, and HDFS

Data source on the Windows 10 side (a console program that appends user input to a shared file):

package com.atguigu.KafkaToHdfs;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Scanner;

public class IOProducer {
    public static void main(String[] args) throws IOException {

        Scanner sc = new Scanner(System.in);

        // Append to the shared file that the Flume TAILDIR source will tail.
        FileOutputStream fos = new FileOutputStream(new File("E:/Share-Virtual/file.txt"), true);

        String line;

        // Read lines from the console and flush each one to the file immediately.
        while (true) {
            System.out.println("Enter a message:");
            line = sc.nextLine();
            fos.write((line + "\n").getBytes());
            fos.flush();
        }
    }
}
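The Windows path E:/Share-Virtual used by the producer and the Linux path /mnt/hgfs/Share-Virtual used in the Flume configuration below are assumed to be the same directory, exposed to the virtual machine as a VMware shared folder: the producer appends lines on the Windows side, and the Flume agent tails the same file from inside the VM.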

Flume agent configuration (TAILDIR source → memory channel → Kafka sink):

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /opt/module/flume/position/tail_dir3.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /mnt/hgfs/Share-Virtual/file.txt

# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sinks.k1.kafka.topic = first
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.producer.linger.ms = 1

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
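Once this file is saved (for example as flume-kafka.conf, a file name assumed here), the agent can be started in the usual way, e.g. flume-ng agent --conf conf --conf-file flume-kafka.conf --name a1; the --name value must match the a1 prefix used above. The TAILDIR source follows the shared file, the memory channel buffers events, and the KafkaSink publishes each line to the first topic on the hadoop102-104 brokers.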

Kafka consumer side (consumes the topic and appends each record to a file on HDFS):

package com.atguigu.KafkaToHdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Properties;


public class MyConsumer {
    // Kafka connection settings
    private static String KafkaHost;
    private static String KafkaGroup;
    private static String KafkaTopic;

    // HDFS connection settings
    private static String HdfsURI;
    private static String HdfsDir;
    private static String hadoopUser;

    private static Configuration hdfsConf;
    private static FileSystem hadoopFS;


    public static void main(String[] args) throws IOException, InterruptedException {
        hadoopUser = "hadoop";

        init();

        System.out.println("Starting the service...");

        hdfsConf = new Configuration();

        // Connect to HDFS as the given user.
        hadoopFS = FileSystem.get(URI.create(HdfsURI), hdfsConf, hadoopUser);

        // Create the target directory if it does not exist yet.
        if (!hadoopFS.exists(new Path("/" + HdfsDir))) {
            hadoopFS.mkdirs(new Path("/" + HdfsDir));
        }

        System.out.println("Service started, listening for messages");

        run();
    }


    public static void run() {

        Properties properties = new Properties();

        // Kafka broker address, consumer group, offset handling and deserializers
        properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, KafkaHost);
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
        properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "100");
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, KafkaGroup);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");

        KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(properties);

        consumer.subscribe(Arrays.asList(KafkaTopic));

        // Poll the topic forever; every record is appended to a single file on HDFS.
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(100);

            for (ConsumerRecord<String, String> record : records) {
                String s = record.value();

                System.out.println(s);

                ByteArrayInputStream fis = new ByteArrayInputStream((s + "\n").getBytes());

                // Create the HDFS file on first use, append to it afterwards.
                FSDataOutputStream fos = null;
                try {
                    if (!hadoopFS.exists(new Path("/" + HdfsDir + "/tmp1.txt"))) {
                        fos = hadoopFS.create(new Path("/" + HdfsDir + "/tmp1.txt"), false);
                    } else {
                        fos = hadoopFS.append(new Path("/" + HdfsDir + "/tmp1.txt"));
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }

                try {
                    IOUtils.copyBytes(fis, fos, hdfsConf);
                } catch (IOException e) {
                    e.printStackTrace();
                }

                IOUtils.closeStream(fos);
                IOUtils.closeStream(fis);
            }
        }
    }


    private static void init() {
        // Kafka and HDFS settings; the topic must match the one the Flume KafkaSink writes to.
        KafkaHost = "hadoop102:9092";
        KafkaGroup = "test";
        KafkaTopic = "first";
        HdfsURI = "hdfs://hadoop102:9000";
        HdfsDir = "kafka-hdfs";
    }
}
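As a quick end-to-end check, a small HDFS read-back program like the sketch below can print what has been written. The URI, user name, and file path are assumptions copied from the MyConsumer settings above and may need adjusting for a different cluster.

package com.atguigu.KafkaToHdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.net.URI;

public class ReadBack {
    public static void main(String[] args) throws Exception {
        // Same URI, user and path as MyConsumer (assumed; adjust to your cluster).
        FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), new Configuration(), "hadoop");

        // Print the collected messages to stdout without closing System.out.
        FSDataInputStream in = fs.open(new Path("/kafka-hdfs/tmp1.txt"));
        IOUtils.copyBytes(in, System.out, 4096, false);
        IOUtils.closeStream(in);
        fs.close();
    }
}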
