自制简易搜索引擎-CSDN博客

本文链接：https://blog.csdn.net/zpf1217/article/details/5811282

本文介绍了一款基于字典树数据结构的简易搜索引擎实现过程。该引擎能够根据给定的初始URL抓取网页，提取链接并递归地搜索含有特定关键字的页面，同时计算页面的相关性权重。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

根据上一篇文章，想继续做点东西，就重新更新了数据结构，做了一个小型的搜索引擎。

主要功能：给定一个初始URL，以此URL为起点向外发散抓取页面，统计每个页面中关键字出现的次数，并计算页面的PR值（并未使用Google的PageRank算法，只是自己简单计算），最后按关键字出现次数从大到小输出含有关键字的页面（关键字需要手动指定）。

PS：因为互联网上的网页太多，可以指定只搜索N层，超过N层后停止。

存储页面URL用字典树，有个关键字列表，根据列表中的信息搜索。

贴代码：呵呵

主类：定义基本信息和开始

package advance; import java.util.ArrayList; import java.util.Timer; import com.sun.org.apache.bcel.internal.generic.NEW; public class Robot { public static ArrayList<URLObj> urlNeedSearchList= new ArrayList<URLObj>(); public static ArrayList<KeyWord> keyWordList = new ArrayList<KeyWord>(); public Robot() { // urlNeedSearchList = new ArrayList<URLObj>(); } public static void main(String[] args) { String urlString ="http://www.hapistar.com"; URLObj urlObj = Dictionary.getInstance().setURLs(urlString); urlObj.index =1; urlNeedSearchList.add(urlObj); KeyWord keyWord = new KeyWord("淫荡"); keyWordList.add(keyWord); Thread[] http = new Thread[10]; for (int i = 0; i < http.length; i++) { http[i] = new Thread(new HTTP()); http[i].start(); } try { Thread.sleep(10000); for (int i = 0; i < http.length; i++) { http[i].stop(); } } catch (InterruptedException e) { e.printStackTrace(); } KeyWordValue value =keyWordList.get(0).value; while (value!= null) { System.out.println(value.leafNode.getUrlByLeafNode()+" times = "+value.timeApperance); value =value.nextValue; } } }

字典树实现：

package advance; import java.util.ArrayList; public class Dictionary { private static Dictionary instance; public static DictionaryNode root ; public Dictionary() { root = new DictionaryNode(); } public static Dictionary getInstance() { if (instance == null) { instance = new Dictionary(); } return instance; } public DictionaryNode getLeafNodeByURL(String urlString) { //对字符串做前期处理 urlString = urlString.trim(); if (urlString.endsWith("/")) { urlString = urlString.substring(0,urlString.length()-1); } DictionaryNode currentNode = root; int urlLength = urlString.length(); for (int i = 0; i < urlLength ; i++) { char value = urlString.charAt(i); if (value> DictionaryNode.startIndex && value < DictionaryNode.endIndex) //在区间内 { if (currentNode.childNodes[value-DictionaryNode.startIndex ] == null) //不存在此节点，就添加 { return null; } currentNode = currentNode.childNodes[value-DictionaryNode.startIndex]; } } return currentNode; } /** * 插入URL 返回URLOBJ，包含叶子结点和是否已存在 * @param urlString * @return URLObj */ public URLObj setURLs(String urlString) { boolean isExist = false; //对字符串做前期处理 urlString = urlString.trim(); if (urlString.endsWith("/")) { urlString = urlString.substring(0,urlString.length()-1); } DictionaryNode currentNode = root; //对URL每一个字符进行遍历，添加到字典树中 int urlLength = urlString.length(); for (int i = 0; i < urlLength ; i++) { //最后一个字符有些特殊处理，所以作了区分 if (i < urlLength-1) { char value = urlString.charAt(i); if (value> DictionaryNode.startIndex && value < DictionaryNode.endIndex) //在区间内 { if (currentNode.childNodes[value-DictionaryNode.startIndex ] == null) //不存在此节点，就添加 { DictionaryNode tempNode = new DictionaryNode(); tempNode.nodeValue = value; tempNode.fathorNode = currentNode; currentNode.childNodes[value-DictionaryNode.startIndex] = tempNode; } currentNode = currentNode.childNodes[value-DictionaryNode.startIndex]; } else { break; } } else { //最后一个字符要做特殊操作 char value = urlString.charAt(urlLength-1); if (value> DictionaryNode.startIndex && value < DictionaryNode.endIndex) //在区间内 { if 
(currentNode.childNodes[value-DictionaryNode.startIndex ] == null) //不存在此节点，就添加 { DictionaryNode tempNode = new DictionaryNode(); tempNode.nodeValue = value; tempNode.fathorNode = currentNode; currentNode.childNodes[value-DictionaryNode.startIndex] = tempNode; } else { isExist = true; //如果在叶子节点都是已有节点，那么这个URL是已经存在的啦 } currentNode = currentNode.childNodes[value-DictionaryNode.startIndex]; currentNode.isLeaf = true; } } } URLObj returnObj = new URLObj(); returnObj.urlNode = currentNode; returnObj.isAlreadyExist = isExist; return returnObj; } /** * 返回以root为根结点下所有的叶子结点，因为我们的N叉树记录了父结点，所以只需要找到叶子结点，就能向上找到整个URL * @param root * @param allLeafNodes 记录结果的List * 深度遍历 */ public void getAllLeafNode(DictionaryNode root ,ArrayList<DictionaryNode> allLeafNodes) { if (root.isLeaf) { allLeafNodes.add(root); return; } for (int i = 0; i < root.childNodes.length; i++) { if (root.childNodes[i]!= null) { getAllLeafNode(root.childNodes[i],allLeafNodes); } } } }

字典NODE：

package advance; public class DictionaryNode { /** URL中第一个可用字符的ASCII码 33是'!',122是'z',用ASCII码-startIndex来作为nextNodes的索引 */ public static int startIndex = 33, endIndex = 122; char nodeValue = (char) -1; boolean isLeaf =false; int weightValue = 1; /** N叉树。。 */ DictionaryNode[] childNodes; DictionaryNode fathorNode = null; public DictionaryNode() { childNodes = new DictionaryNode[endIndex - startIndex]; } /** * 通过叶子结点反向解析得到URL * @param leafNode * @return */ public String getUrlByLeafNode() { StringBuffer sBuffer = new StringBuffer(); if (!isLeaf) { return null; } DictionaryNode nextValueNode = this; while (nextValueNode!=null) { sBuffer.append(nextValueNode.nodeValue); nextValueNode = nextValueNode.fathorNode; } sBuffer= sBuffer.reverse(); sBuffer.deleteCharAt(0); return sBuffer.toString(); } }

搜索网页逻辑：

package advance; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.util.regex.Matcher; import java.util.regex.Pattern; public class HTTP implements Runnable { StringBuffer textStringBuffer; String patternStrs=" href ?= ?/"(.*?)/""; String keyWordString = Robot.keyWordList.get(0).keyWordString; public String urlString =null; URLDownPic downPic= null; /** 搜索的深度，如果为1就只搜索当前页面的页面或其他 */ public static int searchDepth = 1; /** 线程如果发现没有队列中没有URL，就睡眠一次，睡N次后，就中止线程 */ int sleepTimes = 0; int sleepMaxTime = 10; public HTTP(String url) { downPic = new URLDownPic(); textStringBuffer = new StringBuffer(); urlString = url; } public HTTP() { downPic = new URLDownPic(); textStringBuffer = new StringBuffer(); } public String getText(String url) { try { String urlName = url; URL U = new URL(urlName); URLConnection connection = U.openConnection(); //设置超时等待时间为两秒 connection.setConnectTimeout(2000); connection.connect(); BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream())); String line; while ((line = in.readLine()) != null) { textStringBuffer.append(line); textStringBuffer.append("/n"); } in.close(); return textStringBuffer.toString(); } catch (Exception e) { e.printStackTrace(); } return null; } public void go(URLObj urlObj) { //根据url得到HTTP流 String httpString = getText(urlObj.urlNode.getUrlByLeafNode()); //getURL出现异常，会返回NULL if (httpString ==null) { return; } // System.out.println(httpString); //跟据正则表达式获得网址，一类是图片，一类是网页网址，遍历找啊！ Pattern p=Pattern.compile(patternStrs); if (p == null) //没有找到就返回 { return; } Matcher m=p.matcher(httpString); while (m!=null && m.find()) { // String tempS= httpString.substring(m.start(),m.end()-1); String tempS = m.group(1); //如果是相对路径就转成绝对路径 if (!tempS.startsWith("http")) { StringBuffer sBuffer = new StringBuffer(urlObj.urlNode.getUrlByLeafNode()); if (!(tempS.charAt(0)=='/')) { sBuffer.append('/'); } sBuffer.append(tempS); tempS = 
sBuffer.toString(); } //找到的有可能是重复的,重复计算完权值后就跳过 URLObj setUrlResultObj = Dictionary.getInstance().setURLs(tempS); DictionaryNode leafNode = setUrlResultObj.urlNode; //根据各种算法计算网页的权值，呵呵 WeightValue.calculate(urlObj.urlNode,leafNode); if (setUrlResultObj.isAlreadyExist) { continue; } //如果超过设置的深度也跳过 if (urlObj.index>searchDepth) { continue; } //加锁，防止添加时有别的线程删除 synchronized (Robot.urlNeedSearchList) { Robot.urlNeedSearchList.add(new URLObj(leafNode,urlObj.index+1)); } System.out.println(tempS); } //关键字，哈哈！ p=Pattern.compile(keyWordString); if (p == null) //没有找到就返回 { return; } m=p.matcher(httpString); //测试出现了多少次，如果>1就记录网页数据 int times = 0; while (m!=null && m.find()) { times++; } if (times>0) { KeyWordValue tempValue = new KeyWordValue(); tempValue.timeApperance = times; tempValue.leafNode = urlObj.urlNode; Robot.keyWordList.get(0).insertValue(tempValue); } } /** * 一直循环看是否有新URL，有的话就GO！ * 没有有SLEEP，如果超过N次就意味着没有了，嘿嘿，虽然不会出现，停止,exit */ @Override public void run() { boolean isHasNewUrl = false; while (true) { try { if (Robot.urlNeedSearchList.size()>0) { URLObj urlObj =null; synchronized (Robot.urlNeedSearchList) { if (Robot.urlNeedSearchList.size()>0) { urlObj = Robot.urlNeedSearchList.get(0); Robot.urlNeedSearchList.remove(0); isHasNewUrl= true; } } if (isHasNewUrl) { go(urlObj); } } else { Thread.sleep(500); sleepTimes++; if (sleepTimes>sleepMaxTime) { break; } } } catch (Exception e) { e.printStackTrace(); } } } }

每一个关键字结构：每个关键字包含一个链表，每一项都是一个网页URL和对应此关键字出现的次数

package advance;

/**
 * A search keyword together with a singly linked list of the pages that
 * contain it, kept in descending order of occurrence count.
 */
public class KeyWord {
    String keyWordString;

    /** Head of the result list (highest count first); null while empty. */
    KeyWordValue value;

    public KeyWord(String keyString) {
        keyWordString = keyString;
    }

    /**
     * Inserts an entry so the list stays sorted by descending count. An entry
     * whose count equals existing ones is placed after them. Synchronized
     * because one keyword is shared by all crawler threads.
     *
     * @param insertValue entry to add; its nextValue is overwritten
     */
    public synchronized void insertValue(KeyWordValue insertValue) {
        // Empty list, or strictly larger than the head: become the new head.
        if (value == null || value.timeApperance < insertValue.timeApperance) {
            insertValue.nextValue = value;
            value = insertValue;
            return;
        }
        // The head is >= the new entry here, so 'previous' always starts on a
        // real node. The earlier version threw a NullPointerException when the
        // new count equalled the head's count.
        KeyWordValue previous = value;
        while (previous.nextValue != null
                && previous.nextValue.timeApperance >= insertValue.timeApperance) {
            previous = previous.nextValue;
        }
        // Link the tail first so the list is never visibly truncated.
        insertValue.nextValue = previous.nextValue;
        previous.nextValue = insertValue;
    }
}

关键字链表项：

package advance;

/**
 * One entry of a keyword's result list: a page (identified by its trie node)
 * and the number of times the keyword appears in it.
 */
public class KeyWordValue {
    /** Following entry in the list; null when this is the last one. */
    KeyWordValue nextValue;

    /** How many times the keyword occurs in the page. */
    int timeApperance;

    /** Trie node from which the page's URL can be rebuilt. */
    DictionaryNode leafNode;

    public KeyWordValue() {
        super();
    }
}

URL对象：存储字典树的叶子节点（从叶子节点回溯即可找到URL）

package advance;

/**
 * Handle for a URL stored in the trie: the node that ends the URL, a numeric
 * index and a flag telling whether the URL had been stored before.
 */
public class URLObj {
    /** Set by the trie when the URL was already present on insertion. */
    boolean isAlreadyExist = false;

    /** Defaults to -1 until a caller assigns it. */
    int index = -1;

    /** Trie node from which the URL can be rebuilt. */
    DictionaryNode urlNode;

    /** Creates an object with default field values. */
    public URLObj() {
    }

    /**
     * @param node  trie node that ends the URL
     * @param index value stored in {@link #index}
     */
    public URLObj(DictionaryNode node, int index) {
        urlNode = node;
        this.index = index;
    }
}

网页权值计算：方便替换新算法

package advance;

/**
 * Page weight calculation, kept in its own class so the formula can be swapped.
 */
public class WeightValue {
    /**
     * Adds to {@code source.weightValue} the integer quotient of
     * {@code pointToSource.weightValue} divided by {@code source.weightValue}.
     *
     * @param source        node whose weight is increased
     * @param pointToSource node whose weight feeds the increase
     */
    public static void calculate(DictionaryNode source, DictionaryNode pointToSource) {
        int current = source.weightValue;
        // Integer division already truncates, so no extra rounding is needed.
        int gain = pointToSource.weightValue / current;
        source.weightValue = current + gain;
    }
}

完了，呵呵，注，现在关键字列表还没完成，只能放一个关键字，即如果放了很多关键字，只有第一个会起效，呵呵，因为最近在想关键字生成的问题，不能老是手动输入，太傻了，也跟张明哲，吴登容交流了下，也有一点想法，并且在做网站的时候，每个页面有个META DATA区，就是设置网页关键字的，方便搜索引擎的爬虫去找，呵呵，这一点先放下，先去看下LUCENE是怎么做的，呵呵