HttpSource -> File Roll Sink
The built-in File Roll Sink can persist data from a source to the local filesystem, but its file names cannot be customized, and it can only roll files by elapsed interval rather than at fixed points in time. To roll files on a schedule you can implement your own sink; this post follows the approach described in http://blog.csdn.net/tswisdom/article/details/41483471.
This post implements a custom file sink that writes output into a fixed directory layout: collected results go into one directory per day, with a new file every 5 minutes. For example, a log entry from 12:26 on 2012-12-29 should land in the file under /data/flume_test/20121229/ whose name starts with log-1225-.
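To make the bucketing rule concrete, here is a small standalone sketch (an illustration only, not part of the sink code below) that derives the day, hour and 5-minute strings for the example timestamp; minute 26 rounds down to 25, which is why 12:26 on 2012-12-29 maps to /data/flume_test/20121229/log-1225-.
import java.text.SimpleDateFormat;
import java.util.Date;

public class BucketRuleDemo {
    public static void main(String[] args) throws Exception {
        // The example timestamp from above: 2012-12-29 12:26
        Date ts = new SimpleDateFormat("yyyy-MM-dd HH:mm").parse("2012-12-29 12:26");
        String dayStr = new SimpleDateFormat("yyyyMMdd").format(ts); // 20121229
        String hourStr = new SimpleDateFormat("HH").format(ts);      // 12
        int minute = Integer.parseInt(new SimpleDateFormat("mm").format(ts));
        // Round the minute down to its 5-minute bucket and left-pad to two digits.
        String minStr = String.format("%02d", (minute / 5) * 5);     // 25
        System.out.println("/data/flume_test/" + dayStr + "/log-" + hourStr + minStr + "-");
    }
}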
Configuration file
a1.sources=r1
a1.sinks=k1
a1.channels=c1
a1.sources.r1.type=http
a1.sources.r1.bind=localhost
a1.sources.r1.port=50000
a1.sources.r1.channels=c1
a1.sources.r1.handler=com.test.flume.PlainJSONHandler
a1.sources.r1.interceptors = i1 i2 logformat
a1.sources.r1.interceptors.i1.preserveExisting = true
a1.sources.r1.interceptors.i1.type = timestamp
a1.sources.r1.interceptors.i2.type = host
a1.sources.r1.interceptors.i2.hostHeader = hostname
a1.sources.r1.interceptors.logformat.type = com.test.flume.LogFormatInterceptor$Builder
a1.sinks.k1.channel=c1
a1.sinks.k1.type=com.test.flume.FileSink
#a1.sinks.k1.sink.directory=/data/flume_test2/
#a1.sinks.k1.sink.rollInterval = 3600
a1.sinks.k1.sink.batchSize=100
a1.sinks.k1.file.path = /data/flume_test2/%{dayStr}
a1.sinks.k1.file.filePrefix = log-%{hourStr}%{minStr}-
a1.sinks.k1.file.txnEventMax = 10000
a1.sinks.k1.file.maxOpenFiles = 5
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
Note: interceptors sit between the Source and the Channel the Source writes to; they can transform or drop events received by the Source before those events reach the Channel. Three interceptors are used here: i1, i2 and logformat.
(1) i1 is a Timestamp Interceptor, which adds a timestamp key to the event headers. preserveExisting defaults to false; when set to true, an existing timestamp header is preserved instead of being overwritten.
(2) i2 is a Host Interceptor, which adds the host to the event headers. hostHeader names the header key and defaults to host (here it is set to hostname). The Host Interceptor also has a useIP property, which defaults to true, meaning the IP address is used instead of the hostname.
(3) logformat is a custom interceptor that adds dayStr, hourStr and minStr to the headers for the downstream sink to use; a sketch of how the sink expands these placeholders follows below.
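As an illustration only (the header values here are hypothetical, but the call is the same BucketPath.escapeString invocation that FileSink.process() makes below), this standalone sketch shows how the sink combines those headers with the configured path template:
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;

import org.apache.flume.formatter.output.BucketPath;

public class PathExpansionDemo {
    public static void main(String[] args) {
        // Headers as the logformat interceptor would set them for 2012-12-29 12:26.
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("dayStr", "20121229");
        headers.put("hourStr", "12");
        headers.put("minStr", "25");
        // file.path + "/" + file.filePrefix from the configuration above.
        String template = "/data/flume_test2/%{dayStr}/log-%{hourStr}%{minStr}-";
        String realPath = BucketPath.escapeString(template, headers,
                false, Calendar.SECOND, 1);
        System.out.println(realPath); // /data/flume_test2/20121229/log-1225-
    }
}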
Code
FileSink
package com.test.flume;
import java.io.IOException;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
public class FileSink extends AbstractSink implements Configurable {
private static final Logger logger = LoggerFactory
.getLogger(FileSink.class);
private String path;
private static final String defaultFileName = "FlumeData";
private static final int defaultMaxOpenFiles = 50;
/**
 * Maximum number of events taken from the channel in a single transaction.
 */
private long txnEventMax;
private FileWriterLinkedHashMap sfWriters;
private String serializerType;
private Context serializerContext;
private boolean needRounding = false;
private int roundUnit = Calendar.SECOND;
private int roundValue = 1;
private SinkCounter sinkCounter;
private int maxOpenFiles;
private ScheduledExecutorService timedRollerPool;
private long rollInterval;
public void configure(Context context) {
String directory = Preconditions.checkNotNull(
context.getString("file.path"), "file.path is required");
String fileName = context.getString("file.filePrefix", defaultFileName);
this.path = directory + "/" + fileName;
maxOpenFiles = context.getInteger("file.maxOpenFiles",
defaultMaxOpenFiles);
serializerType = context.getString("sink.serializer", "TEXT");
serializerContext = new Context(
context.getSubProperties(EventSerializer.CTX_PREFIX));
txnEventMax = context.getLong("file.txnEventMax", 1L);
if (sinkCounter == null) {
sinkCounter = new SinkCounter(getName());
}
rollInterval = context.getLong("file.rollInterval", 30L);
String rollerName = "hdfs-" + getName() + "-roll-timer-%d";
timedRollerPool = Executors.newScheduledThreadPool(maxOpenFiles,
new ThreadFactoryBuilder().setNameFormat(rollerName).build());
}
public Status process() throws EventDeliveryException {
Channel channel = getChannel();
Transaction transaction = channel.getTransaction();
List<BucketFileWriter> writers = Lists.newArrayList();
transaction.begin();
try {
Event event = null;
int txnEventCount = 0;
for (txnEventCount = 0; txnEventCount < txnEventMax; txnEventCount++) {
event = channel.take();
if (event == null) {
break;
}
// reconstruct the path name by substituting placeholders
String realPath = BucketPath.escapeString(path, event.getHeaders(),
needRounding, roundUnit, roundValue);
BucketFileWriter bucketFileWriter = sfWriters.get(realPath);
// we haven't seen this file yet, so open it and cache the
// handle
if (bucketFileWriter == null) {
bucketFileWriter = new BucketFileWriter();
bucketFileWriter.open(realPath, serializerType,
serializerContext, rollInterval , timedRollerPool,
sfWriters);
sfWriters.put(realPath, bucketFileWriter);
}
// track the buckets getting written in this transaction
if (!writers.contains(bucketFileWriter)) {
writers.add(bucketFileWriter);
}
// Write the data to File
bucketFileWriter.append(event);
}
if (txnEventCount == 0) {
sinkCounter.incrementBatchEmptyCount();
} else if (txnEventCount == txnEventMax) {
sinkCounter.incrementBatchCompleteCount();
} else {
sinkCounter.incrementBatchUnderflowCount();
}
// flush all pending buckets before committing the transaction
for (BucketFileWriter bucketFileWriter : writers) {
if (!bucketFileWriter.isBatchComplete()) {
flush(bucketFileWriter);
}
}
transaction.commit();
if (txnEventCount > 0) {
sinkCounter.addToEventDrainSuccessCount(txnEventCount);
}
if (event == null) {
return Status.BACKOFF;
}
return Status.READY;
} catch (IOException eIO) {
transaction.rollback();
logger.warn("File IO error", eIO);
return Status.BACKOFF;
} catch (Throwable th) {
transaction.rollback();
logger.error("process failed", th);
if (th instanceof Error) {
throw (Error) th;
} else {
throw new EventDeliveryException(th);
}
} finally {
transaction.close();
}
}
private void flush(BucketFileWriter bucketFileWriter) throws IOException {
bucketFileWriter.flush();
}
@Override
public synchronized void start() {
super.start();
this.sfWriters = new FileWriterLinkedHashMap(maxOpenFiles);
sinkCounter.start();
}
}
BucketFileWriter
package com.test.flume;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class BucketFileWriter {
private static final Logger logger = LoggerFactory
.getLogger(BucketFileWriter.class);
private static final String IN_USE_EXT = ".tmp";
/**
 * Counter seeded with the current time in milliseconds; its value is
 * appended to the file name so that each open() writes to a unique file.
 */
private final AtomicLong fileExtensionCounter;
private OutputStream outputStream;
private EventSerializer serializer;
private String filePath;
public BucketFileWriter() {
fileExtensionCounter = new AtomicLong(System.currentTimeMillis());
}
public void open(final String filePath, String serializerType,
Context serializerContext, final long rollInterval,
final ScheduledExecutorService timedRollerPool,
final FileWriterLinkedHashMap sfWriters) throws IOException {
this.filePath = filePath;
File file = new File(filePath + fileExtensionCounter + IN_USE_EXT);
file.getParentFile().mkdirs();
outputStream = new BufferedOutputStream(new FileOutputStream(file));
logger.info("filename = " + file.getAbsolutePath());
serializer = EventSerializerFactory.getInstance(serializerType,
serializerContext, outputStream);
serializer.afterCreate();
if (rollInterval > 0) {
Callable<Void> action = new Callable<Void>() {
public Void call() throws Exception {
logger.debug(
"Rolling file ({}): Roll scheduled after {} sec elapsed.",
filePath + fileExtensionCounter + IN_USE_EXT,
rollInterval);
if (sfWriters.containsKey(filePath)) {
sfWriters.remove(filePath);
}
close();
return null;
}
};
timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
}
}
public void append(Event event) throws IOException {
serializer.write(event);
}
public boolean isBatchComplete() {
return true;
}
public void flush() throws IOException {
serializer.flush();
outputStream.flush();
}
/**
* Rename bucketPath file from .tmp to permanent location.
*/
private void renameBucket() {
File srcPath = new File(filePath + fileExtensionCounter + IN_USE_EXT);
File dstPath = new File(filePath + fileExtensionCounter);
if (srcPath.exists()) {
srcPath.renameTo(dstPath);
logger.info("Renaming " + srcPath + " to " + dstPath);
}
}
/**
 * Flush and close the output stream, then rename the in-use .tmp file to
 * its permanent name.
 */
public synchronized void close() throws IOException, InterruptedException {
if (outputStream != null) {
outputStream.flush();
outputStream.close();
}
renameBucket();
}
}
FileWriterLinkedHashMap
package com.test.flume;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FileWriterLinkedHashMap extends
LinkedHashMap<String, BucketFileWriter> {
private static final Logger logger = LoggerFactory
.getLogger(FileWriterLinkedHashMap.class);
private static final long serialVersionUID = -7860596835613215998L;
private final int maxOpenFiles;
public FileWriterLinkedHashMap(int maxOpenFiles) {
super(16, 0.75f, true); // default capacity and load factor, access-ordered so the eldest entry is the least recently used
this.maxOpenFiles = maxOpenFiles;
}
@Override
protected boolean removeEldestEntry(Entry<String, BucketFileWriter> eldest) {
if (size() > maxOpenFiles) {
// If we have more than maxOpenFiles entries, close the eldest
// writer and tell the map to evict it by returning true.
try {
eldest.getValue().close();
} catch (IOException e) {
logger.warn(eldest.getKey().toString(), e);
} catch (InterruptedException e) {
logger.warn(eldest.getKey().toString(), e);
Thread.currentThread().interrupt();
}
return true;
} else {
return false;
}
}
}
LogFormatInterceptor
package com.test.flume;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class LogFormatInterceptor implements Interceptor {
private static final Logger logger = LoggerFactory.getLogger(LogFormatInterceptor.class);
private SimpleDateFormat sd = null;
private SimpleDateFormat sh = null;
private SimpleDateFormat sm = null;
public LogFormatInterceptor() {
}
public void close() {
}
public void initialize() {
sd = new SimpleDateFormat("yyyyMMdd");
sh = new SimpleDateFormat("HH");
sm = new SimpleDateFormat("mm");
}
public Event intercept(Event event) {
try {
Map<String, String> headers = event.getHeaders();
String body = new String(event.getBody());
Date date = new Date();
headers.put("dayStr", sd.format(date));
headers.put("hourStr", sh.format(date));
// Round the minute down to its 5-minute bucket and left-pad to two digits.
int m = Integer.parseInt(sm.format(date));
String min = String.format("%02d", (m / 5) * 5);
headers.put("minStr", min);
Event e = EventBuilder.withBody(body.getBytes(), headers);
return e;
} catch (Exception e) {
logger.error("LogFormat error!", e);
}
return null;
}
public List<Event> intercept(List<Event> events) {
List<Event> list = new LinkedList<Event>();
for (Event event : events) {
Event e = intercept(event);
if (e != null) {
list.add(e);
}
}
return list;
}
/**
 * Builder which builds new instances of the LogFormatInterceptor.
 */
public static class Builder implements Interceptor.Builder {
public Interceptor build() {
return new LogFormatInterceptor();
}
public void configure(Context context) {}
}
}
PlainJSONHandler
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.test.flume;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.JSONEvent;
import org.apache.flume.source.http.HTTPBadRequestException;
import org.apache.flume.source.http.HTTPSourceHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.JsonParseException;
import com.google.gson.JsonParser;
/**
 * PlainJSONHandler for HTTPSource that accepts a JSON-formatted HTTP body.
 *
 * The entire request body is wrapped in a single event; the strict JSON
 * validation step is kept commented out in getEvents().
 */
public class PlainJSONHandler implements HTTPSourceHandler {
private static final String FORWARD_HEADERS = "forwardHeaders";
private static final Logger LOG =
LoggerFactory.getLogger(PlainJSONHandler.class);
private static JsonParser parser = new JsonParser();
private static Set<String> forwardHeaders = new HashSet<String>();
public List<Event> getEvents(HttpServletRequest request) throws Exception {
Map<String,String> eventHeaders = new HashMap<String,String>();
Enumeration requestHeaders = request.getHeaderNames();
while (requestHeaders.hasMoreElements()) {
String header = (String) requestHeaders.nextElement();
if (forwardHeaders.contains(header)) {
eventHeaders.put(header, request.getHeader(header));
}
}
BufferedReader reader = request.getReader();
List<Event> eventList = new ArrayList<Event>(1);
// Read the entire request body into a single string.
StringBuilder bodyBuffer = new StringBuilder();
String line;
while ((line = reader.readLine()) != null) {
bodyBuffer.append(line);
}
if (bodyBuffer.length() > 0) {
/*try {
parser.parse(bodyBuffer.toString());
} catch (JsonParseException ex) {
throw new HTTPBadRequestException(
"HTTP body is not a valid JSON object.", ex);
}*/
Event event = new JSONEvent();
event.setBody(bodyBuffer.toString().getBytes());
event.setHeaders(eventHeaders);
eventList.add(event);
LOG.info("========= Event body:" + new String(event.getBody()) + "==============");
}
return eventList;
}
public void configure(Context context) {
String confForwardHeaders = context.getString(FORWARD_HEADERS);
if (confForwardHeaders != null) {
if (forwardHeaders.addAll(Arrays.asList(confForwardHeaders.split(",")))) {
LOG.debug("forwardHeaders=" + forwardHeaders);
} else {
LOG.error("error to get forward headers from " + confForwardHeaders);
}
} else {
LOG.debug("no forwardHeaders");
}
}
}
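For a quick end-to-end test, a JSON body can be POSTed to the HTTP source. Below is a minimal client sketch, assuming the agent configured above is running and the source is listening on localhost:50000 (the class name and payload are only examples):
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class SendTestEvent {
    public static void main(String[] args) throws Exception {
        // The HTTP source in the configuration above listens on localhost:50000.
        HttpURLConnection conn = (HttpURLConnection) new URL("http://localhost:50000").openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json");
        OutputStream out = conn.getOutputStream();
        out.write("{\"msg\":\"hello flume\"}".getBytes("UTF-8"));
        out.close();
        // PlainJSONHandler wraps the raw body in a single event; 200 means it was accepted.
        System.out.println("HTTP response code: " + conn.getResponseCode());
        conn.disconnect();
    }
}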