0% found this document useful (0 votes)
550 views12 pages

Web Grabber: Java-Based Site Scraper

This document contains source code for a Java application called Web Grabber. It includes classes for parsing HTML, grabbing links and images from a website, and downloading the content. Screenshots of the application interface are also included in an appendix. The source code appendix outlines classes for parsing HTML, grabbing website content, and a test method for grabbing links and files at different levels of the site.

Uploaded by

Sreekanth Jayan
Copyright
© Attribution Non-Commercial (BY-NC)
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
550 views12 pages

Web Grabber: Java-Based Site Scraper

This document contains source code for a Java application called Web Grabber. It includes classes for parsing HTML, grabbing links and images from a website, and downloading the content. Screenshots of the application interface are also included in an appendix. The source code appendix outlines classes for parsing HTML, grabbing website content, and a test method for grabbing links and files at different levels of the site.

Uploaded by

Sreekanth Jayan
Copyright
© Attribution Non-Commercial (BY-NC)
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd

Web Grabber

BIBLIOGRAPHY

1. An Integrated Approach to Software Engineering – Pankaj Jalote

2. Systems Analysis and Design – Elias M. Awad

3. Elements of Systems Analysis – Marvin Gore & John W. Stubbe

4. Java For Dummies

5. Java 2: The Complete Reference – Herbert Schildt

6. Thinking in Java - Bruce Eckel

7. Wikipedia – [Link]

Dept. of Computer Science, Don Bosco College


Web Grabber

Appendix
Screenshots
Web Grabber – Main window

Web Grabber – Enter URL

Dept. of Computer Science, Don Bosco College


Web Grabber

Web Grabber – Enter Level

Web Grabber – Start download

Dept. of Computer Science, Don Bosco College


Web Grabber

Web Grabber – Settings

Web Grabber – Start download

Dept. of Computer Science, Don Bosco College


Web Grabber

Web Grabber – Enter file URL

Web Grabber – Start download

Dept. of Computer Science, Don Bosco College


Web Grabber

Source Code (Java)


HTML parser class
package [Link];
import [Link];

import [Link];
import [Link];
import [Link];
import [Link];
import [Link];

import [Link];
import [Link];

import [Link];
import [Link];
import [Link];
import [Link];

public class ParseHtml {

[Link] doc ;
String siteURL;
/**
 * Creates a parser for the page at the given address.
 *
 * Stores the address string in {@code siteURL}, builds a {@code URL} from it and
 * assigns the result of a call taking that URL and {@code 10*1000} to {@code doc}.
 * Any exception raised in the try block is caught here and not rethrown, so in
 * that case {@code doc} is not assigned by this constructor.
 *
 * NOTE(review): every {@code [Link]} token is a placeholder left by the text
 * extraction of this listing; the real receiver/method names are not visible.
 *
 * @param urls address of the page to load, as a string
 */
public ParseHtml(String urls )
{
siteURL = urls ;
try {
URL url = new URL(urls);
print("Fetching %s...", [Link]());

// presumably the fetch-and-parse call; 10*1000 looks like a timeout in ms — TODO confirm
doc = [Link](url, 10*1000);


}catch (Exception e) {
// Broad catch: malformed URLs and fetch failures all end up here.
[Link]("Connection error!!!");
// NOTE(review): decrements something hidden behind the placeholder — confirm what is counted
[Link]--;
[Link]();
}
}

/**
 * Exposes the document currently held by this parser.
 *
 * @return the value of the {@code doc} field, returned as-is (no copy is made)
 */
public Document getDoc() {
return this.doc;
}

/**
 * Writes the given content to the file at the given path.
 *
 * Opens a {@code PrintStream} on {@code path}, then makes two calls that are hidden
 * behind {@code [Link]} placeholders (extraction artifacts) — presumably printing
 * {@code content} and closing the stream; TODO confirm. Any exception is caught
 * and not rethrown.
 *
 * NOTE(review): the closing brace of this method is missing in this listing.
 *
 * @param path    destination file path
 * @param content text to write
 */
public void saveHtml(String path , String content)


{
try{
PrintStream printStream = new PrintStream(path);
[Link](content);
[Link]();
}catch (Exception e) {
[Link]();
}

/**
 * Returns a string produced by a single no-argument call.
 *
 * NOTE(review): the call target is hidden behind the {@code [Link]} extraction
 * placeholder; presumably it yields the HTML of {@code doc} — confirm against the
 * original source.
 *
 * @return the string returned by the hidden call
 */
public String getHtmlCode(){

return [Link]();
}

Dept. of Computer Science, Don Bosco College


Web Grabber
/**
 * Collects link addresses from elements matching the selector {@code "a[href]"}.
 *
 * For each matched element a value is compared with {@code siteURL} via
 * {@code equals}; only when they are equal is an entry added (the added value is
 * obtained with the key {@code "abs:href"}).
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders, so the exact
 * receiver of each call (and what is compared with {@code siteURL}) is not visible.
 *
 * @return the collected list, or {@code null} when any exception is thrown inside the try block
 */
public ArrayList<String> getLinks(){

try {

Elements links = [Link]("a[href]");

ArrayList<String> arr=new ArrayList<String>();

for (Element link : links) {

// NOTE(review): the format string has one %s but two arguments are passed
print("%s", [Link]("abs:href"), trim([Link](), 35));

if([Link]().equals(siteURL)){
[Link]([Link]("abs:href"));
}
}

return arr;

} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
}

// Reached only after an exception was caught above.
return null;
}

/**
 * Collects source addresses from elements matching the selector {@code "[src]"}.
 *
 * Both branches of the {@code "img"} test make an add-style call with a value
 * obtained via the key {@code "abs:src"}; the branches differ only in the message
 * passed to {@code print}.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders. Several closing
 * braces (the if-branch, the for loop and the method itself) are missing in this
 * listing.
 *
 * @return the collected list, or {@code null} when any exception is thrown inside the try block
 */
public ArrayList<String> getImageJsLinks(){

try {

Elements media = [Link]("[src]");

ArrayList<String> arr=new ArrayList<String>();

print("\nMedia: (%d)", [Link]());

for (Element src : media) {

// Image elements get a more detailed message (width, height, shortened alt text).
if ([Link]().equals("img")){
print(" * %s: <%s> %sx%s (%s)",
[Link](), [Link]("abs:src"), [Link]("width"), [Link]("height"),
trim([Link]("alt"), 20));
[Link]([Link]("abs:src"));

else{
print(" * %s: <%s>", [Link](), [Link]("abs:src"));
[Link]([Link]("abs:src"));

}
return arr;

} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
}

// Reached only after an exception was caught above.
return null;

/**
 * Collects addresses from elements matching the selector {@code "link[href]"}.
 *
 * Every matched element contributes one entry to {@code import_array}; no filtering
 * on the {@code "rel"} value is visible, it is only passed to {@code print}.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders, and the method's
 * closing brace is missing in this listing.
 *
 * @return the collected list, or {@code null} when any exception is thrown inside the try block
 */
public ArrayList<String> getCss(){

try {

// NOTE(review): the next four lines are page footer/header text from the document, not code.
Dept. of Computer Science, Don Bosco College


Web Grabber
Elements imports = [Link]("link[href]");

ArrayList<String> import_array=new ArrayList<String>();

for (Element link : imports) {


print(" * %s <%s> (%s)", [Link](),[Link]("abs:href"),
[Link]("rel"));
import_array.add([Link]("abs:href"));
}
// NOTE(review): hidden call taking the list — presumably a debug print; confirm
[Link](import_array);
return import_array;
} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
}
// Reached only after an exception was caught above.
return null;

/**
 * Logging helper that takes a format string and its arguments.
 *
 * NOTE(review): the body is empty in this listing, so as written every call is a
 * no-op. It cannot be told from here whether the original body was lost during
 * extraction or was intentionally empty — confirm against the original source.
 *
 * @param msg  format string
 * @param args values referenced by the format string
 */
private void print(String msg, Object... args) {


}

/**
 * Shortens a string for display.
 *
 * When a hidden measure exceeds {@code width}, returns the result of a call with
 * arguments {@code (0, width-1)} followed by {@code "."}; otherwise returns
 * {@code s} unchanged.
 *
 * NOTE(review): the {@code [Link]} tokens are extraction placeholders; they
 * presumably stand for the length and substring calls on {@code s} — confirm.
 *
 * @param s     text to shorten
 * @param width threshold above which the text is cut
 * @return the shortened text ending in "." or {@code s} itself
 */
private String trim(String s, int width) {


if ([Link]() > width)
return [Link](0, width-1) + ".";
else
return s;
}

HTML Grabber class


package [Link];

import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];

import [Link];

import [Link];
import [Link];

public class Grabber {

DownloadThread downloadThread;

/**
 * Downloads a single page and, optionally, the resources it references.
 *
 * Visible steps: build a {@code URL} from {@code url}; derive a file name, cutting
 * it at '?' and appending ".html" under the conditions below; create a
 * {@code ParseHtml} for the page; append a folder name to the save path and create
 * a {@code File} for it; pass the page content through {@code saveCss} and, when
 * {@code getImages} is true, through {@code saveImages}; finally make a call with
 * {@code (path, content)}.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders, so receivers and
 * method names are not visible. Closing braces for the {@code if(getImages)} block
 * and for the method are missing in this listing.
 *
 * @param url       address of the page to grab
 * @param getImages whether the content is also passed through {@code saveImages}
 * @param savPath   base folder under which the site folder is created
 */
public static void grabTheSite(String url,boolean getImages,String savPath){


URL urls;
try {
urls = new URL(url); //sets the main url
[Link]("Grabbing :"+url);
// presumably the part of url starting at its last '/' — TODO confirm
String filename=[Link]([Link]('/'));
if([Link](".")){
if([Link]("?")){
// Keeps only the part before the first '?' (hidden receiver; presumably filename).
filename=[Link](0,[Link]('?'));
}
if(![Link](".html")){
filename=filename+".html";
}

// NOTE(review): the next four lines are page footer/header text from the document, not code.
Dept. of Computer Science, Don Bosco College


Web Grabber
}
[Link](urls);
String savePath = savPath; //sets the folder path
String content;

ParseHtml parseHtml = new ParseHtml(url);

// Builds a URL from a baseUri() value; the receiver chain is hidden by the placeholder.
URL urlf=new URL([Link]().baseUri());


String folder = [Link](); //gets the foldername(domain name)

// NOTE(review): no separator is inserted between savePath and folder here.
savePath = savePath + folder;


File baseFolder = new File(savePath);
// presumably creates the folder on disk — TODO confirm
[Link]();

//[Link]("first page only...");


String path = savePath + "/"+filename; //path of firstpage
ArrayList<String> cssLinks=new ArrayList<String>([Link]());
[Link](cssLinks);

content=[Link]();
// saveCss returns the content after its own replacement calls; the result replaces content.
content=saveCss(cssLinks, content, savePath);

if(getImages){

ArrayList<String>imageLinks=
new ArrayList<String>([Link]());
[Link](imageLinks);

content=saveImages(imageLinks, content, savePath);

// presumably writes the page content to path — TODO confirm
[Link](path, content);

} catch (MalformedURLException e) {
// TODO Auto-generated catch block
[Link]();
}

/**
 * Processes each distinct address in {@code imgUrls} and returns the updated content.
 *
 * The addresses are first copied into a {@code HashSet}, so duplicates are handled
 * once. For each address a name {@code temp} is derived, a call is made with a
 * {@code URL} for the address and a {@code File} at
 * {@code savePath + "/images/" + temp}, and {@code content} is reassigned from a
 * call taking the address and {@code "./images/" + temp}.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders; the download and
 * replace calls are presumed, not visible. Closing braces for the catch block and
 * the for loop are missing in this listing.
 *
 * @param imgUrls  addresses to process
 * @param content  page content to update
 * @param savePath folder under which the "images" subfolder path is built
 * @return the content after the replacement calls
 */
public static String saveImages(ArrayList<String> imgUrls , String content ,


String savePath)
{
HashSet<String> imgs = new HashSet<String>(imgUrls);

for(String s : imgs)
{
String temp = new String(s);
// presumably keeps the part from the last "/" to the end (leading "/" included) — TODO confirm
temp = [Link]([Link]("/"), [Link]());
try {
[Link](new URL(s), new File(savePath+"/images/"+temp));
} catch (Exception e) {

// TODO Auto-generated catch block


[Link]();

String t = "./images/"+temp ;
content = [Link](s, t);

[Link](s+ " " +temp);

// NOTE(review): the next four lines are page footer/header text from the document, not code.
Dept. of Computer Science, Don Bosco College


Web Grabber
[Link]("image:"+ temp);

return content;
}

/**
 * Processes each distinct address in {@code cssUrls} and returns the updated content.
 *
 * The addresses are first copied into a {@code HashSet}, so duplicates are handled
 * once. For each address a name {@code temp} is derived, a call is made with a
 * {@code URL} for the address and a {@code File} at {@code savePath + temp}, and
 * {@code content} is reassigned from a call taking the address and
 * {@code "./" + temp}.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders; the download and
 * replace calls are presumed, not visible. Closing braces for the catch block and
 * the for loop are missing in this listing.
 *
 * @param cssUrls  addresses to process
 * @param content  page content to update
 * @param savePath folder path used as the prefix of the target file path
 * @return the content after the replacement calls
 */
public static String saveCss(ArrayList<String> cssUrls , String content ,


String savePath)
{
HashSet<String> css = new HashSet<String>(cssUrls);

for(String s : css)
{
String temp = new String(s);
// presumably keeps the part from the last "/" to the end (leading "/" included) — TODO confirm
temp = [Link]([Link]("/"), [Link]());
try {
[Link](new URL(s), new File(savePath+temp));
} catch (Exception e) {

// TODO Auto-generated catch block


[Link]();

String t = "./"+temp ;
content = [Link](s, t);

[Link](s+ " " +temp);


[Link]("css:" + s);

return content;
}

/**
 * Entry point that grabs pages according to a depth level.
 *
 * level 0: calls {@code grabTheSite} for {@code url} only.
 * level 1: builds a {@code ParseHtml} for every entry of {@code pageLinks}, makes
 *          merge-style calls with the results, then calls {@code grabTheSite} for
 *          every entry of {@code pageLinks}; a boolean flag is polled and the
 *          thread sleeps in 1000 ms steps while it is true.
 * level 2: like level 1 without the flag checks, with a second pass over
 *          {@code pageLinks} before the final grab loop.
 * Other level values do nothing beyond the initial parse.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders, so the flag being
 * polled, the log targets and the collections being merged are not visible. The
 * {@code thread} parameter does not appear by name in the visible body; it may be
 * hidden behind placeholders.
 *
 * @param level     depth selector (0, 1 or 2)
 * @param url       starting page address
 * @param path      save path handed to {@code grabTheSite}
 * @param getImages forwarded to {@code grabTheSite}
 * @param thread    download thread associated with this run (usage not visible)
 */
public void test(int level,String url,String path,boolean getImages ,


DownloadThread thread){

ParseHtml parseHtml=new ParseHtml(url);


// A set, so each link address is held once.
HashSet<String> pageLinks=new HashSet<String>([Link]());
if(level==0){
grabTheSite(url, getImages, path);
}
if(level==1){
if([Link] == false){
[Link]("Entering level 1 strategy...");
[Link]("Entering level 1 strategy...");

// NOTE(review): cssLinks is not referenced by name in the visible code below
HashSet<String> cssLinks=new HashSet<String>([Link]());


ArrayList<String> subLinks=new ArrayList<String>();

[Link]("Initialization success....");
[Link]("Initialization success....");

for(String link:pageLinks){
try{
[Link]("getting sublinks of::"+link);
[Link]("getting sublinks of::"+link+" ");
ParseHtml subParseHtml=new ParseHtml(link);
[Link]([Link]());

// NOTE(review): the next four lines are page footer/header text from the document, not code.
Dept. of Computer Science, Don Bosco College


Web Grabber
}catch(Exception e){
[Link]("Dead link...!");
[Link]("Dead link...!");
[Link]();
}
}
// presumably merges subLinks into pageLinks — TODO confirm
[Link](subLinks);
}else
{
// Polls the hidden flag once per second until it becomes false.
while([Link] == true){
try {
[Link]().sleep(1000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
[Link]();
}
[Link]("Thread sleeping");
}
}

for(String link:pageLinks){
try{
if([Link] == false){
grabTheSite(link, getImages, path); }
else
{
// NOTE(review): grabTheSite is only in the if-branch, so a link reached while the
// flag is true is not grabbed after the wait ends — confirm this is intended
while([Link] == true){
[Link]().sleep(1000);
[Link]("Thread sleeping");
}
}
}catch (Exception e) {
[Link]("Error grabbing page :" +link);
[Link]("Error grabbing page :" +link);
}
}
}
if(level==2){
[Link]("Entering level 2 strategy...");
// NOTE(review): this message says "level 1" although this is the level==2 branch
[Link]("Entering level 1 strategy...");

// NOTE(review): cssLinks is not referenced by name in the visible code below
HashSet<String> cssLinks=new HashSet<String>([Link]());


ArrayList<String> subLinks=new ArrayList<String>();

[Link]("Initialization success....");
[Link]("Initialization success....");

for(String link:pageLinks){
try{
[Link]("getting sublinks of::"+link);
[Link]("getting sublinks of::"+link+" ");
ParseHtml subParseHtml=new ParseHtml(link);
[Link]([Link]());
[Link]();
}catch(Exception e){
[Link]("Dead link...!");
[Link]("Dead link...!");
//[Link]();
}
}
// presumably merges subLinks into pageLinks — TODO confirm
[Link](subLinks);

// Second pass over the (presumably enlarged) link set.
for(String link:pageLinks){
try{
ParseHtml lev2parse=new ParseHtml(link);
[Link]([Link]());
}catch (Exception e) {
[Link]("dead link...");
[Link]("Dead link...! ");
}

// NOTE(review): the next four lines are page footer/header text from the document, not code.
Dept. of Computer Science, Don Bosco College


Web Grabber
}

[Link](subLinks);

for(String link:pageLinks){
try{
grabTheSite(link, getImages, path);
}catch (Exception e) {
[Link]("Error grabbing page :" +link);
[Link]("Error grabbing page :" +link+" ");
}
}

}
}

/**
 * Downloads a single file to {@code path} under a name derived from the address.
 *
 * The name is derived in up to three steps (after the last '/', then after a "?",
 * then after a "%"), prefixed with "WG", and used to build the target
 * {@code File} as {@code path + filename}. {@code MalformedURLException} and
 * {@code IOException} are caught here and not rethrown.
 *
 * NOTE(review): {@code [Link]} tokens are extraction placeholders, so the receivers
 * of the string operations and of the download call are not visible.
 *
 * @param url  address of the file to download
 * @param path folder path used as the prefix of the target file path
 */
public static void saveFile(String url,String path){


// NOTE(review): extension is not referenced by name in the visible code below
String extension=[Link]([Link](".")-1,[Link]());
String filename=[Link]([Link]('/')+1,[Link]());
if([Link]("?")){
filename=[Link]([Link]("?")+1,[Link]());
}
if([Link]("%")){
filename=[Link]([Link]("%")+1,[Link]());
}
filename="WG"+filename;
try {
[Link]("Downloading :" +filename);
[Link]("Download started!"+ filename);
// NOTE(review): path and filename are concatenated without a separator
[Link](new URL(url), new File(path+filename));
[Link]("Finished downloading :" + filename);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
[Link]();
} catch (IOException e) {
// TODO Auto-generated catch block
[Link]();
}
}

Dept. of Computer Science, Don Bosco College

You might also like