《自己动手写搜索引擎》日志分析类代码解析,并修正为兼容 Lucene 3.0.2

本文介绍了一种简单的搜索日志分析方法,包括如何提取关键信息如搜索关键字、IP地址及搜索结果数量等,并通过示例展示了日志的格式与处理流程。然而,该方法存在一定的局限性,难以满足复杂场景下的需求。

搜索日志是用来分析用户搜索行为和信息需求的重要依据。一般记录如下信息:

  • 搜索关键字
  • 用户来源IP
  • 本次搜索返回结果数量
  • 搜索时间
  • 其他需要记录的应用相关信息

   例如,日志每行的格式为:搜索时间|日志类型|搜索类型|搜索关键字|IP地址|本次搜索返回结果数量。日志文件内容示例如下:

2008-04-04 12:01:19.2876|DEBUG|blog|title:瑞丽女性网|222.130.192.109|8
2008-04-04 12:01:22.1626|DEBUG|blog|title:瑞丽女性网|222.130.192.109|8
2008-04-04 12:01:35.0376|DEBUG|blog||222.130.192.109|10
2008-04-04 12:01:44.0688|DEBUG|blog||222.130.192.109|10
2008-04-04 12:03:31.1938|DEBUG|blog|清明,祭奠我所有的过去...|222.130.192.109|1
2008-04-04 12:37:19.7720|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:37:39.7563|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:39:08.8657|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:42:12.6313|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:42:19.4282|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:42:46.8657|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:48:23.8813|DEBUG|blog|可见要想吃上这里的爆肚还要赶早不赶晚啊|222.130.192.109|1
2008-04-04 15:55:57.1470|DEBUG|blog|aaa|127.0.0.1|10
2008-04-04 15:57:23.4282|DEBUG|blog|aaa|222.130.192.109|10
2008-04-04 16:06:40.1626|DEBUG|blog|汽车|222.130.192.109|10
2008-04-04 16:06:46.7563|DEBUG|blog|汽车|222.130.192.109|10

《自己动手写搜索引擎》日志分析源代码解析:

package org.apache.log;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.HashSet;
//这里是分析搜索日志的部分
/**
 * Analyses search-log files and writes a per-keyword summary.
 *
 * <p>Each log line is expected to look like
 * {@code 2008-04-04 16:06:40.1626|DEBUG|blog|keyword|222.130.192.109|10}
 * (time | log level | search type | keyword | client IP | hit count).
 * For every {@code *.txt} file in the log directory the keywords are
 * aggregated independently and one record per keyword is appended to the
 * output file in the form {@code keyword%hitCount%distinctIpCount}.
 */
public class SearchLog2File {

    /** Log directory used by {@link #main} when no argument is supplied. */
    private static final String DEFAULT_LOG_PATH =
            "E:/Java Projects/ses/src/test/lucene/dic/log";

    /** Output file used by {@link #main} when no second argument is supplied. */
    private static final String DEFAULT_OUTPUT_PATH =
            "E:/Java Projects/ses/src/test/lucene/dic/searchword/searchWords1.txt";

    /** Keywords shorter than this are not recorded. */
    private static final int MIN_KEYWORD_LENGTH = 2;

    /** Keywords longer than this are not recorded. */
    private static final int MAX_KEYWORD_LENGTH = 20;

    /**
     * Substrings that cause a whole log line to be discarded. Most of them are
     * character sequences that show up when text was decoded with the wrong
     * charset; the rest ("?", ",", "=", "^", "AND ") reject punctuation and
     * boolean query syntax.
     */
    private static final String[] GARBLED_MARKERS = {
        "?", ",", "=", "骞", "鐭", "︾", "鐢佃", "╂", "鐜", "鶶",
        "^", "廸", "閸", "嬭", "鍟", "鏂", "籂", "濞", "鐑", "瓙",
        "ユ", "磿", "嬫", "傚", "鐥", "滅", "閻", "彛", "寮", "儤",
        "闁", "闈", "湇", "鍏", "潡", "庡", "笅", "鐣", "冩", "撳",
        "鏉", "彿", "搧", "鎺", "闂", "閺", "墖", "夎", "浜", "褰",
        "锟斤", "AND "
    };

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the log directory and
     *             {@code args[1]} the output file; missing values fall back to
     *             the built-in default paths
     * @throws Exception if the logs cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        String logPath = args.length > 0 ? args[0] : DEFAULT_LOG_PATH;
        String searchWords = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        logFiler(logPath, searchWords);
    }

    /**
     * Analyses every {@code *.txt} log file directly inside {@code logPath}
     * and writes the aggregated keywords to {@code searchWords}.
     *
     * <p>The output is encoded as GBK, one record per line, formatted as
     * {@code keyword%hitCount%distinctIpCount}. Aggregation is done per log
     * file, so a keyword occurring in several files yields several records.
     *
     * @param logPath     directory that contains the log files
     * @param searchWords path of the file that receives the analysis result
     *                    (created or truncated)
     * @throws IllegalArgumentException if {@code logPath} is not a readable directory
     * @throws Exception                on any I/O failure
     */
    public static void logFiler(String logPath, String searchWords) throws Exception {
        // listFiles() returns null (it does not throw) for a missing path or a
        // non-directory; check before the output file is created/truncated.
        File[] fileArray = new File(logPath).listFiles();
        if (fileArray == null) {
            throw new IllegalArgumentException(
                    "Log path is not a readable directory: " + logPath);
        }

        FileOutputStream fos = new FileOutputStream(searchWords);
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "GBK"));

            for (int i = 0; i < fileArray.length; i++) {
                File logFile = fileArray[i];
                String fileName = logFile.getName();
                if (!logFile.isFile() || !fileName.endsWith(".txt")) {
                    continue;
                }

                // keyword -> distinct client IPs that searched for it
                HashMap<String, HashSet<String>> word2IP =
                        new HashMap<String, HashSet<String>>();
                // keyword -> hit count of the first recorded search for it
                HashMap<String, Integer> word2ResultNum = new HashMap<String, Integer>();

                // Progress output: the first (up to) 10 characters of the file
                // name. Bounded so that short names such as "a.txt" do not throw.
                String fileDate = fileName.substring(0, Math.min(10, fileName.length()));
                System.out.println(fileDate);

                readLogFile(logFile, word2IP, word2ResultNum);
                writeSummary(bw, word2IP, word2ResultNum);
            }

            bw.flush();
        } finally {
            // Closes the underlying stream on both success and failure.
            fos.close();
        }
    }

    /** Reads one log file line by line and aggregates every usable line into the maps. */
    private static void readLogFile(File logFile,
            HashMap<String, HashSet<String>> word2IP,
            HashMap<String, Integer> word2ResultNum) throws Exception {
        // NOTE(review): the log is decoded with the platform default charset
        // while the output is written as GBK - confirm the logs' real encoding.
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(logFile)));
        try {
            String readline;
            while ((readline = br.readLine()) != null) {
                if (!isGarbled(readline)) {
                    recordLine(readline, word2IP, word2ResultNum);
                }
            }
        } finally {
            br.close();
        }
    }

    /** Returns true when the line contains any of the {@link #GARBLED_MARKERS}. */
    private static boolean isGarbled(String line) {
        for (int i = 0; i < GARBLED_MARKERS.length; i++) {
            if (line.indexOf(GARBLED_MARKERS[i]) >= 0) {
                return true;
            }
        }
        return false;
    }

    /** Parses one log line and, if it passes all filters, records its keyword, IP and hit count. */
    private static void recordLine(String line,
            HashMap<String, HashSet<String>> word2IP,
            HashMap<String, Integer> word2ResultNum) {
        StringTokenizer st = new StringTokenizer(line, "|");

        // Six fields are required: time, level, search type, keyword, IP, hits.
        // StringTokenizer merges consecutive delimiters, so a line with an empty
        // keyword ("blog||ip|10") has only five tokens and is dropped here.
        if (st.countTokens() < 6) {
            return;
        }
        st.nextToken(); // time, e.g. 2008-04-04 16:06:40.1626
        st.nextToken(); // log level, e.g. DEBUG
        st.nextToken(); // search type, e.g. blog

        String strCont = collapseRepeatedQuery(st.nextToken());
        if (strCont.length() > MAX_KEYWORD_LENGTH || strCont.length() < MIN_KEYWORD_LENGTH) {
            return;
        }
        // Skip fielded queries such as "title:xxx".
        if (strCont.indexOf(":") >= 0) {
            return;
        }

        String strIP = st.nextToken();
        int resultNum = 0;
        try {
            resultNum = Integer.parseInt(st.nextToken());
        } catch (NumberFormatException ignored) {
            // Deliberate: an unparsable hit count is treated as 0 hits, so the
            // line cannot introduce a new keyword.
        }

        HashSet<String> ips = word2IP.get(strCont);
        if (ips != null) {
            // Known keyword: frequency is the number of distinct IPs, so
            // repeated searches from the same IP count once.
            ips.add(strIP);
        } else if (resultNum > 0) {
            ips = new HashSet<String>();
            ips.add(strIP);
            word2IP.put(strCont, ips);
            // Only the hit count of the first recorded search is kept; later
            // searches that return a different count are not reflected.
            word2ResultNum.put(strCont, Integer.valueOf(resultNum));
        }
    }

    /**
     * Collapses a query whose first two space-separated terms are identical
     * ("car car") into that single term; any other query is returned unchanged
     * (so "car wash car" is NOT reduced to "car wash").
     */
    private static String collapseRepeatedQuery(String query) {
        StringTokenizer stQuery = new StringTokenizer(query, " ");
        if (stQuery.countTokens() >= 2) {
            String key1 = stQuery.nextToken();
            String key2 = stQuery.nextToken();
            if (key1.equals(key2)) {
                return key1;
            }
        }
        return query;
    }

    /** Appends one "keyword%hitCount%distinctIpCount" line per keyword to the writer. */
    private static void writeSummary(BufferedWriter bw,
            HashMap<String, HashSet<String>> word2IP,
            HashMap<String, Integer> word2ResultNum) throws Exception {
        // This is the place to insert into a database log table instead of
        // (or in addition to) writing the file, if required.
        for (java.util.Map.Entry<String, HashSet<String>> e : word2IP.entrySet()) {
            bw.write(e.getKey() + "%"
                    + word2ResultNum.get(e.getKey()) + "%"
                    + e.getValue().size());
            // A real line break; the previous code wrote the two characters "/n".
            bw.write("\n");
        }
    }
}

      从代码可以看出,这种日志记录的内容存在很大的局限性:它适用于简单的日志分析,但很难满足企业搜索或商务网站搜索的需求。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值