Flume Notes (3): Storing HttpSource Data Locally


HttpSource -> File Roll Sink

  FileRollSink can write events from a source to the local filesystem, but its file names cannot be customized, and it cannot roll files at fixed clock times, only after a fixed elapsed interval. To get scheduled, predictably named files you can write your own sink; the implementation here follows http://blog.csdn.net/tswisdom/article/details/41483471.

  Using a custom file sink, this post writes output in a fixed directory layout: collected results go into one folder per day and one file per 5-minute interval. For example, a log from 12:26 on 2012-12-29 should end up in a file under /data/flume_test/20121229/ whose name starts with log-1225-.


Configuration

a1.sources=r1
a1.sinks=k1
a1.channels=c1
 
a1.sources.r1.type=http
a1.sources.r1.bind=localhost
a1.sources.r1.port=50000
a1.sources.r1.channels=c1
a1.sources.r1.handler=com.test.flume.PlainJSONHandler
a1.sources.r1.interceptors = i1 i2 logformat
 
a1.sources.r1.interceptors.i1.preserveExisting = true
a1.sources.r1.interceptors.i1.type = timestamp
a1.sources.r1.interceptors.i2.type = host
a1.sources.r1.interceptors.i2.hostHeader = hostname
a1.sources.r1.interceptors.logformat.type = com.test.flume.LogFormatInterceptor$Builder
 
a1.sinks.k1.channel=c1
a1.sinks.k1.type=com.test.flume.FileSink
#a1.sinks.k1.sink.directory=/data/flume_test2/
#a1.sinks.k1.sink.rollInterval = 3600
a1.sinks.k1.sink.batchSize=100
a1.sinks.k1.file.path = /data/flume_test2/%{dayStr}
a1.sinks.k1.file.filePrefix = log-%{hourStr}%{minStr}-
a1.sinks.k1.file.txnEventMax = 10000
a1.sinks.k1.file.maxOpenFiles = 5
 
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
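
Of these keys, the custom sink below only reads file.path, file.filePrefix, file.maxOpenFiles, file.txnEventMax, file.rollInterval, and sink.serializer. The sink.batchSize line and the commented-out sink.directory / sink.rollInterval keys belong to the stock File Roll Sink and are ignored by the custom FileSink.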

Note: interceptors sit between the source and the channel the source writes to; before events received by the source are written to the corresponding channel, interceptors can transform or drop them. Three interceptors are used here: i1, i2, and logformat.

   (1) i1 is a Timestamp Interceptor, which adds a timestamp key to the event headers. preserveExisting defaults to false; when set to true (as here), an existing timestamp header is preserved rather than overwritten;

   (2) i2 is a Host Interceptor, which adds a host key to the event headers. hostHeader names the header key (default host, set to hostname here). The Host Interceptor also has a useIP property whose default is true, meaning the IP address is used rather than the hostname;

   (3) logformat is a custom interceptor that adds dayStr, hourStr, and minStr to the headers for the sink to use; a worked example of how these expand into the output path follows.
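
For concreteness: an event intercepted at 12:26 on 2012-12-29 gets dayStr=20121229, hourStr=12, minStr=25, so file.path and file.filePrefix expand to /data/flume_test2/20121229/log-1225-. The sink performs this substitution with Flume's BucketPath; the snippet below (plain JDK, hypothetical class name, illustrative only) mimics the same %{key} expansion:

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PathExpandDemo {
    // Replaces %{key} tokens with values from the headers map --
    // the same effect BucketPath.escapeString has inside the sink.
    static String expand(String template, Map<String, String> headers) {
        Matcher m = Pattern.compile("%\\{(\\w+)\\}").matcher(template);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(sb, Matcher.quoteReplacement(headers.get(m.group(1))));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    public static void main(String[] args) {
        Map<String, String> h = new HashMap<String, String>();
        h.put("dayStr", "20121229");
        h.put("hourStr", "12");
        h.put("minStr", "25");
        System.out.println(expand("/data/flume_test2/%{dayStr}/log-%{hourStr}%{minStr}-", h));
        // -> /data/flume_test2/20121229/log-1225-
    }
}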


Code

FileSink

package com.test.flume;

import java.io.IOException;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

public class FileSink extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(FileSink.class);

    private static final String defaultFileName = "FlumeData";
    private static final int defaultMaxOpenFiles = 50;

    /** Path template (directory + file prefix) containing %{...} placeholders. */
    private String path;
    /** Maximum number of events taken from the channel per transaction. */
    private long txnEventMax;
    private FileWriterLinkedHashMap sfWriters;
    private String serializerType;
    private Context serializerContext;
    private boolean needRounding = false;
    private int roundUnit = Calendar.SECOND;
    private int roundValue = 1;
    private SinkCounter sinkCounter;
    private int maxOpenFiles;
    private ScheduledExecutorService timedRollerPool;
    private long rollInterval;

    public void configure(Context context) {
        String directory = Preconditions.checkNotNull(
                context.getString("file.path"), "file.path is required");
        String fileName = context.getString("file.filePrefix", defaultFileName);
        this.path = directory + "/" + fileName;
        maxOpenFiles = context.getInteger("file.maxOpenFiles", defaultMaxOpenFiles);
        serializerType = context.getString("sink.serializer", "TEXT");
        serializerContext = new Context(
                context.getSubProperties(EventSerializer.CTX_PREFIX));
        txnEventMax = context.getLong("file.txnEventMax", 1l);
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }
        rollInterval = context.getLong("file.rollInterval", 30l);
        // Thread-name format for the roll scheduler.
        String rollerName = "hdfs-" + getName() + "-roll-timer-%d";
        timedRollerPool = Executors.newScheduledThreadPool(maxOpenFiles,
                new ThreadFactoryBuilder().setNameFormat(rollerName).build());
    }

    public Status process() throws EventDeliveryException {
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        List<BucketFileWriter> writers = Lists.newArrayList();
        transaction.begin();
        try {
            Event event = null;
            int txnEventCount = 0;
            for (txnEventCount = 0; txnEventCount < txnEventMax; txnEventCount++) {
                event = channel.take();
                if (event == null) {
                    break;
                }
                // Reconstruct the path by substituting the %{...} placeholders
                // with values from the event headers.
                String realPath = BucketPath.escapeString(path, event.getHeaders(),
                        needRounding, roundUnit, roundValue);
                BucketFileWriter bucketFileWriter = sfWriters.get(realPath);
                // We haven't seen this file yet, so open it and cache the handle.
                if (bucketFileWriter == null) {
                    bucketFileWriter = new BucketFileWriter();
                    bucketFileWriter.open(realPath, serializerType,
                            serializerContext, rollInterval, timedRollerPool,
                            sfWriters);
                    sfWriters.put(realPath, bucketFileWriter);
                }
                // Track the buckets getting written in this transaction.
                if (!writers.contains(bucketFileWriter)) {
                    writers.add(bucketFileWriter);
                }
                // Write the event to the file.
                bucketFileWriter.append(event);
            }
            if (txnEventCount == 0) {
                sinkCounter.incrementBatchEmptyCount();
            } else if (txnEventCount == txnEventMax) {
                sinkCounter.incrementBatchCompleteCount();
            } else {
                sinkCounter.incrementBatchUnderflowCount();
            }
            // Flush all pending buckets before committing the transaction.
            for (BucketFileWriter bucketFileWriter : writers) {
                if (!bucketFileWriter.isBatchComplete()) {
                    flush(bucketFileWriter);
                }
            }
            transaction.commit();
            if (txnEventCount > 0) {
                sinkCounter.addToEventDrainSuccessCount(txnEventCount);
            }
            if (event == null) {
                return Status.BACKOFF;
            }
            return Status.READY;
        } catch (IOException eIO) {
            transaction.rollback();
            logger.warn("File IO error", eIO);
            return Status.BACKOFF;
        } catch (Throwable th) {
            transaction.rollback();
            logger.error("process failed", th);
            if (th instanceof Error) {
                throw (Error) th;
            } else {
                throw new EventDeliveryException(th);
            }
        } finally {
            transaction.close();
        }
    }

    private void flush(BucketFileWriter bucketFileWriter) throws IOException {
        bucketFileWriter.flush();
    }

    @Override
    public synchronized void start() {
        super.start();
        this.sfWriters = new FileWriterLinkedHashMap(maxOpenFiles);
        sinkCounter.start();
    }
}
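
One gap in the class above: nothing is released on shutdown. A minimal sketch of a stop() override you could add to FileSink (not part of the original post; it assumes only the fields already defined above):

    @Override
    public synchronized void stop() {
        // Close open writers so their .tmp files get renamed, then release
        // the roll scheduler and the counter.
        if (sfWriters != null) {
            for (BucketFileWriter writer : sfWriters.values()) {
                try {
                    writer.close();
                } catch (Exception e) {
                    logger.warn("Unable to close writer on shutdown", e);
                }
            }
        }
        timedRollerPool.shutdown();
        sinkCounter.stop();
        super.stop();
    }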

BucketFileWriter

package com.test.flume;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BucketFileWriter {

    private static final Logger logger = LoggerFactory
            .getLogger(BucketFileWriter.class);

    private static final String IN_USE_EXT = ".tmp";

    /**
     * Numeric suffix appended to the file name, seeded with the creation
     * timestamp so successive files for the same path prefix do not collide.
     */
    private final AtomicLong fileExtensionCounter;

    private OutputStream outputStream;
    private EventSerializer serializer;
    private String filePath;

    public BucketFileWriter() {
        fileExtensionCounter = new AtomicLong(System.currentTimeMillis());
    }

    public void open(final String filePath, String serializerType,
            Context serializerContext, final long rollInterval,
            final ScheduledExecutorService timedRollerPool,
            final FileWriterLinkedHashMap sfWriters) throws IOException {
        this.filePath = filePath;
        File file = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        file.getParentFile().mkdirs();
        outputStream = new BufferedOutputStream(new FileOutputStream(file));
        logger.info("filename = " + file.getAbsolutePath());
        serializer = EventSerializerFactory.getInstance(serializerType,
                serializerContext, outputStream);
        serializer.afterCreate();
        if (rollInterval > 0) {
            // Schedule a one-shot task that evicts this writer from the cache
            // and closes (renames) its file after rollInterval seconds.
            Callable<Void> action = new Callable<Void>() {
                public Void call() throws Exception {
                    logger.debug(
                            "Rolling file ({}): Roll scheduled after {} sec elapsed.",
                            filePath + fileExtensionCounter + IN_USE_EXT,
                            rollInterval);
                    if (sfWriters.containsKey(filePath)) {
                        sfWriters.remove(filePath);
                    }
                    close();
                    return null;
                }
            };
            timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
        }
    }

    public void append(Event event) throws IOException {
        serializer.write(event);
    }

    public boolean isBatchComplete() {
        return true;
    }

    public void flush() throws IOException {
        serializer.flush();
        outputStream.flush();
    }

    /**
     * Rename the in-use file from its .tmp name to the permanent name.
     */
    private void renameBucket() {
        File srcPath = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        File dstPath = new File(filePath + fileExtensionCounter);
        if (srcPath.exists()) {
            srcPath.renameTo(dstPath);
            logger.info("Renaming " + srcPath + " to " + dstPath);
        }
    }

    /**
     * Close the file handle and rename the temp file to its permanent name.
     */
    public synchronized void close() throws IOException, InterruptedException {
        if (outputStream != null) {
            outputStream.flush();
            outputStream.close();
        }
        renameBucket();
    }
}
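
Note the life cycle this implies: a file lives as log-1225-<millis>.tmp while it is open, and the scheduled roll task (or eviction from the writer cache) closes it and strips the .tmp suffix. Because each BucketFileWriter seeds its suffix from System.currentTimeMillis() at construction time, a bucket that is reopened after a roll gets a fresh file name instead of clobbering the previous one.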

FileWriterLinkedHashMap

package com.test.flume;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map.Entry;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileWriterLinkedHashMap extends
        LinkedHashMap<String, BucketFileWriter> {

    private static final Logger logger = LoggerFactory
            .getLogger(FileWriterLinkedHashMap.class);

    private static final long serialVersionUID = -7860596835613215998L;

    private final int maxOpenFiles;

    public FileWriterLinkedHashMap(int maxOpenFiles) {
        // Stock initial capacity and load factor; access-order = true turns
        // this map into an LRU cache of open file writers.
        super(16, 0.75f, true);
        this.maxOpenFiles = maxOpenFiles;
    }

    @Override
    protected boolean removeEldestEntry(Entry<String, BucketFileWriter> eldest) {
        if (size() > maxOpenFiles) {
            // More than maxOpenFiles writers are open: close the least
            // recently used one and let the map evict it.
            try {
                eldest.getValue().close();
            } catch (IOException e) {
                logger.warn(eldest.getKey(), e);
            } catch (InterruptedException e) {
                logger.warn(eldest.getKey(), e);
                Thread.currentThread().interrupt();
            }
            return true;
        } else {
            return false;
        }
    }
}
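
The access-ordered LinkedHashMap is what makes this an LRU cache. A self-contained illustration (plain JDK, hypothetical class name; strings stand in for writers):

import java.util.LinkedHashMap;
import java.util.Map;

public class LruDemo {
    public static void main(String[] args) {
        final int maxOpen = 2;
        Map<String, String> cache = new LinkedHashMap<String, String>(16, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, String> eldest) {
                return size() > maxOpen; // the real sink closes the writer here
            }
        };
        cache.put("/data/a", "writer-a");
        cache.put("/data/b", "writer-b");
        cache.get("/data/a");             // touch a, so b becomes eldest
        cache.put("/data/c", "writer-c"); // evicts /data/b
        System.out.println(cache.keySet()); // [/data/a, /data/c]
    }
}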

LogFormatInterceptor

package com.test.flume;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LogFormatInterceptor implements Interceptor {

    private static final Logger logger = LoggerFactory.getLogger(LogFormatInterceptor.class);

    // Note: SimpleDateFormat is not thread-safe; this is only safe as long as
    // a single thread runs the interceptor.
    private SimpleDateFormat sd = null;
    private SimpleDateFormat sh = null;
    private SimpleDateFormat sm = null;

    public LogFormatInterceptor() {
    }

    public void close() {
    }

    public void initialize() {
        sd = new SimpleDateFormat("yyyyMMdd");
        sh = new SimpleDateFormat("HH");
        sm = new SimpleDateFormat("mm");
    }

    public Event intercept(Event event) {
        try {
            Map<String, String> headers = event.getHeaders();
            Date date = new Date();
            headers.put("dayStr", sd.format(date));
            headers.put("hourStr", sh.format(date));
            // Round the minute down to its 5-minute bucket and zero-pad it,
            // e.g. 26 -> "25", 7 -> "05".
            int m = Integer.parseInt(sm.format(date));
            String min = String.format("%02d", (m / 5) * 5);
            headers.put("minStr", min);
            return EventBuilder.withBody(event.getBody(), headers);
        } catch (Exception e) {
            logger.error("LogFormat error!", e);
        }
        return null; // drop the event if formatting failed
    }

    public List<Event> intercept(List<Event> events) {
        List<Event> list = new LinkedList<Event>();
        for (Event event : events) {
            Event e = intercept(event);
            if (e != null) {
                list.add(e);
            }
        }
        return list;
    }

    /**
     * Builder which builds new instances of the LogFormatInterceptor.
     */
    public static class Builder implements Interceptor.Builder {
        public Interceptor build() {
            return new LogFormatInterceptor();
        }
        public void configure(Context context) {}
    }
}
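
A quick way to see the headers the interceptor produces (a sketch assuming flume-ng-core on the classpath; the values depend on the current clock):

package com.test.flume;

import java.util.HashMap;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;

public class InterceptorDemo {
    public static void main(String[] args) {
        LogFormatInterceptor interceptor = new LogFormatInterceptor();
        interceptor.initialize();
        Event in = EventBuilder.withBody("hello".getBytes(),
                new HashMap<String, String>());
        Event out = interceptor.intercept(in);
        // Prints something like {dayStr=20121229, hourStr=12, minStr=25}
        System.out.println(out.getHeaders());
    }
}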

PlainJSONHandler 

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.test.flume;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.JSONEvent;
import org.apache.flume.source.http.HTTPBadRequestException;
import org.apache.flume.source.http.HTTPSourceHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.JsonParseException;
import com.google.gson.JsonParser;
/**
 * PlainJSONHandler for HTTPSource that accepts a JSON-based HTTP body.
 *
 * JSON validation is left disabled in getEvents() below; with it enabled,
 * the handler throws HTTPBadRequestException when the body fails to parse.
 */
public class PlainJSONHandler implements HTTPSourceHandler {
   
  private static final String FORWARD_HEADERS = "forwardHeaders";
  private static final Logger LOG =
    LoggerFactory.getLogger(PlainJSONHandler.class);
  private static JsonParser parser = new JsonParser();
  private static Set<String> forwardHeaders = new HashSet<String>();
  public List<Event> getEvents(HttpServletRequest request) throws Exception {
    Map<String, String> eventHeaders = new HashMap<String, String>();
    Enumeration requestHeaders = request.getHeaderNames();
    while (requestHeaders.hasMoreElements()) {
      String header = (String) requestHeaders.nextElement();
      if (forwardHeaders.contains(header)) {
        eventHeaders.put(header, request.getHeader(header));
      }
    }
    // Read the entire request body; readLine() returns null at end of stream.
    BufferedReader reader = request.getReader();
    StringBuilder bodyBuffer = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
      bodyBuffer.append(line);
    }
    List<Event> eventList = new ArrayList<Event>(1);
    if (bodyBuffer.length() > 0) {
      String body = bodyBuffer.toString();
      // JSON validation is left disabled, as in the original; uncomment to
      // reject bodies that do not parse as JSON.
      /*
      try {
        parser.parse(body);
      } catch (JsonParseException ex) {
        throw new HTTPBadRequestException(
            "HTTP body is not a valid JSON object.", ex);
      }
      */
      Event event = new JSONEvent();
      event.setBody(body.getBytes());
      event.setHeaders(eventHeaders);
      eventList.add(event);

      LOG.info("========= Event body:" + new String(event.getBody()) + "==============");
    }
    return eventList;
  }
  public void configure(Context context) {
    String confForwardHeaders = context.getString(FORWARD_HEADERS);
    if (confForwardHeaders != null) {
      if (forwardHeaders.addAll(Arrays.asList(confForwardHeaders.split(",")))) {
        LOG.debug("forwardHeaders=" + forwardHeaders);
      } else {
        LOG.error("error to get forward headers from " + confForwardHeaders);
      }
    } else {
      LOG.debug("no forwardHeaders");
    }
  }
}
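
HTTPSource hands the handler.* sub-properties to the handler's configure(), so forwardHeaders can be set in the same properties file (a hedged example; note that the contains() lookup above is case-sensitive, so the names must match what the servlet container reports):

a1.sources.r1.handler = com.test.flume.PlainJSONHandler
a1.sources.r1.handler.forwardHeaders = Host,User-Agent

To exercise the whole pipeline end to end, a minimal test client (plain JDK, hypothetical class name) can POST a JSON body to the source and you can then watch the dated folders and log-HHmm- files appear under /data/flume_test2:

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class PostTest {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:50000");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json");
        OutputStream os = conn.getOutputStream();
        os.write("{\"msg\":\"hello flume\"}".getBytes("UTF-8"));
        os.close();
        // 200 means the event was accepted and queued on the channel.
        System.out.println("HTTP " + conn.getResponseCode());
        conn.disconnect();
    }
}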

