the5fire的技术博客

关注python、vim、linux、web开发和互联网--life is short, we need python.


java使用正则表达式抓取网页内容存为txt

作者:the5fire | 标签:     | 发布:2011-06-02 2:47 p.m. | 阅读量: 6442, 6370
前几天女友在网上看了一本电子书,想要下载下来,不过那个网站只能支持在线阅读,不提供下载,还好可以复制粘贴。
于是这个复制粘贴的任务便交给了我,看了一下网站URL,单篇文章的HTML源码都很简单,作为一个程序员怎么可以重复地复制粘贴呢?
于是有了这个代码,比较简单:

package WEB;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches a numbered series of web pages, extracts every fragment that matches
 * {@link #regex()}, strips the markup from it, and can save the collected text
 * to a file.
 *
 * @author Hu Yang (the5fire)
 * @blog http://www.the5fire.com
 */
public class WebGet {
    /** Base URL used by {@link #init(String, String)} when the caller passes none. */
    private static final String DEFAULT_URL = "http://www.tianyabook.com/qita/hougeixue/";

    /** Base URL; the page number and ".html" are appended to it in {@link #init(String)}. */
    private String myUrl;
    /** Connection opened by the most recent successful {@link #init(String)} call. */
    private HttpURLConnection con;
    /** Accumulates the extracted text; it is never cleared, so repeated crawls append. */
    private StringBuilder contextAll = new StringBuilder("");

    private int pageCount = 0;
    // Stored by the three-argument constructor but not read by any method of this
    // class; init(String) always appends ".html".
    private String pageType = "";

    public WebGet() {

    }

    public WebGet(String url) {
        this.myUrl = url;
    }

    public WebGet(String url, int pageCount, String pageType) {
        this.myUrl = url;
        this.pageCount = pageCount;
        this.pageType = pageType;
    }

    /**
     * Returns the regular expression used to pick content out of a page.
     *
     * <p>As written this is an empty placeholder. An empty pattern matches the
     * empty string at every position of the input, so it has to be replaced with
     * a real expression before {@link #getContent(String)} yields useful text.
     *
     * @return the pattern source
     */
    public String regex() {
        String googleRegex = "";
        return googleRegex;
    }

    /**
     * Sets the base URL and opens a connection to the given page.
     *
     * @param url  base URL to crawl; when {@code null} or empty the built-in
     *             default address is used instead
     * @param page page name, without the ".html" suffix
     * @throws IOException if the URL is malformed or the connection fails
     */
    public void init(String url, String page) throws IOException {
        // The caller's URL used to be ignored and unconditionally overwritten
        // with the hard-coded address; honour it and keep the old address only
        // as a fallback.
        if (url != null && !url.equals("")) {
            this.myUrl = url;
        } else {
            this.myUrl = DEFAULT_URL;
        }
        this.init(page);
    }

    /**
     * Opens a connection to {@code myUrl + page + ".html"}.
     * Does nothing when no base URL has been set.
     *
     * @param page page name, without the ".html" suffix
     * @throws IOException if the URL is malformed or the connection fails
     */
    public void init(String page) throws IOException {
        if (myUrl == null || myUrl.equals("")) {
            return;
        }
        URL pageUrl = new URL(myUrl + page + ".html");
        con = (HttpURLConnection) pageUrl.openConnection();
        // Redirects stay disabled for this connection, as before. The static
        // HttpURLConnection.setFollowRedirects(true) call that used to precede
        // this line was removed: it changes the JVM-wide default and was
        // overridden for this connection by the instance-level setting anyway.
        con.setInstanceFollowRedirects(false);
        con.connect();
    }

    /**
     * Writes a string to a text file, replacing any existing file.
     * The text is encoded with the platform default charset because none is
     * passed to the {@link OutputStreamWriter}.
     *
     * @param context  the text to write
     * @param filePath path of the file to create
     * @return always {@code true}; failures are reported by exception
     * @throws IOException if the file cannot be opened or written
     */
    public boolean writeTxt(String context, String filePath) throws IOException {
        System.out.println("开始写文件。。");
        OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(
                filePath));
        try {
            osw.write(context, 0, context.length());
            osw.flush();
        } finally {
            // Release the file handle even when the write fails.
            osw.close();
        }

        return true;
    }

    /**
     * Downloads the pages and returns the text extracted from them.
     *
     * <p>Each page is read with its lines joined without separators. Every match
     * of {@link #regex()} has its tags and spaces removed and is appended to an
     * instance-level buffer followed by {@code "\n\t"}. The buffer is not reset,
     * so calling this method again appends to the earlier result.
     *
     * @param codeType charset name used to decode the pages, e.g. GB2312 or UTF-8
     * @return the accumulated text, or the literal string {@code "null"} when
     *         the page count is less than 1
     * @throws IOException           if a page cannot be fetched or decoded
     * @throws IllegalStateException if no base URL has been set
     */
    public String getContent(String codeType) throws IOException {
        if (pageCount < 1) {
            return "null";
        }
        if (myUrl == null || myUrl.equals("")) {
            // init(String) would silently skip connecting; fail with a clear
            // message instead of a NullPointerException on the connection.
            throw new IllegalStateException("base URL is not set");
        }
        System.out.println("开始抓取内容。。。。。");
        // The pattern does not change between pages, so compile it once.
        Pattern pattern = Pattern.compile(regex());
        // NOTE(review): the bound is exclusive, so pages 1..pageCount-1 are
        // fetched and pageCount == 1 fetches nothing. Left unchanged because the
        // real number of pages on the site is unknown; confirm whether
        // "<= pageCount" was intended.
        for (int i = 1; i < pageCount; i++) {
            System.out.println("抓取第 " + i + "页");
            this.init(String.valueOf(i));

            String result = readPage(codeType);
            Matcher matcher = pattern.matcher(result);
            while (matcher.find()) {
                String title = matcher.group().replaceAll("<.*?>", "")
                        .replaceAll(" ", "");

                contextAll.append(title).append("\n\t");
            }
            System.out.println("完成:" + i + "页");
            System.out.println("");
        }

        return contextAll.toString();
    }

    /** Reads the whole body of the current connection as one string, then releases it. */
    private String readPage(String codeType) throws IOException {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(con
                    .getInputStream(), codeType));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // Lines are joined without separators, as before.
                sb.append(line);
            }
            return sb.toString();
        } finally {
            if (br != null) {
                br.close();
            }
            con.disconnect();
        }
    }

    public static void main(String[] args) throws IOException {

        WebGet wg = new WebGet("http://www.tianyabook.com/qita/hougeixue/", 227, "html");
        try {
            if (wg.writeTxt(wg.getContent("GB2312"), "D:\\houhei.txt")) {
                System.out.println("完成");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

    }
}


----EOF-----

扫码关注,或者搜索微信公众号:码农悟凡


其他分类: