一:Java正则表达式的基础用法
package com.kawa.regex;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small collection of static helpers demonstrating the basic operations of
 * {@code java.util.regex}: full match, partial match, group extraction,
 * find-all, replace and split.
 *
 * <p>Every helper compiles its pattern with {@link Pattern#CASE_INSENSITIVE}.
 * Several helpers also print their results to standard output, because the
 * class doubles as a runnable demo.
 */
public class RegexDemo01 {

    /**
     * Runs the replace and split demos.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        replace("\\d", "dsfd;sa;ksd12a34b567c890d888e999f", "*");
        // Further demos, disabled by default:
        // System.out.println(validate("\\d+", "334455aaa"));
        // System.out.println(validate2("\\d+", "334455aaa"));
        // System.out.println(search("(\\d+)([a-z]+)", "334455aaa--3232423aaa-32324bbb"));
        // System.out.println(searchSubStr("(\\d+)([a-z]+)", "334455aaa--3232423aaa-32324bbb"));
        testSplit();
    }

    /** Compiles {@code regex} with the case-insensitive flag shared by all helpers. */
    private static Pattern compile(String regex) {
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /**
     * Checks whether the whole input string matches the regular expression.
     *
     * @param regex  the regular expression
     * @param decStr the string to test
     * @return {@code true} if the entire string matches, {@code false} otherwise
     */
    public static boolean validate(String regex, String decStr) {
        // matches() succeeds only when the pattern covers the entire input.
        return compile(regex).matcher(decStr).matches();
    }

    /**
     * Checks whether any substring of the input matches the regular expression.
     *
     * @param regex  the regular expression
     * @param decStr the string to scan
     * @return {@code true} if at least one substring matches, {@code false} otherwise
     */
    public static boolean validate2(String regex, String decStr) {
        // find() scans for the next matching subsequence, so a partial match is enough.
        return compile(regex).matcher(decStr).find();
    }

    /**
     * Finds the first substring matching the regular expression and prints the
     * whole match followed by every capture group (group 0 is the whole match).
     *
     * <p>The number of groups printed follows {@link Matcher#groupCount()}, so
     * patterns with any number of capture groups (including none) are safe.
     *
     * @param regex  the regular expression
     * @param decStr the string to scan
     * @return the first matching substring, or the empty string if nothing matches
     */
    public static String search(String regex, String decStr) {
        Matcher m = compile(regex).matcher(decStr);
        if (!m.find()) {
            return "";
        }
        String foundstring = m.group();
        System.out.println("foundstring:" + foundstring);
        // Asking for a group index above groupCount() throws
        // IndexOutOfBoundsException, so iterate only over groups that exist.
        for (int i = 0; i <= m.groupCount(); i++) {
            System.out.println("foundstring" + i + ":" + m.group(i));
        }
        return foundstring;
    }

    /**
     * Collects every substring of the input that matches the regular
     * expression, printing each one on its own line.
     *
     * @param regex  the regular expression
     * @param decStr the string to scan
     * @return all matching substrings in order of appearance; empty if none match
     */
    public static List<String> searchSubStr(String regex, String decStr) {
        Matcher m = compile(regex).matcher(decStr);
        List<String> list = new ArrayList<String>();
        while (m.find()) {
            String match = m.group();
            list.add(match);
            System.out.println(match);
        }
        return list;
    }

    /**
     * Replaces every substring matching the regular expression, printing the
     * original string and then the result.
     *
     * <p>The replacement is passed to {@link Matcher#replaceAll(String)}, so
     * {@code $} and {@code \} in it are interpreted as group references and
     * escapes rather than literal characters.
     *
     * @param regex      the regular expression
     * @param decStr     the string to process
     * @param replaceStr the replacement for each match
     * @return the string with all matches replaced
     */
    public static String replace(String regex, String decStr, String replaceStr) {
        Matcher m = compile(regex).matcher(decStr);
        System.out.println(decStr);
        String newstring = m.replaceAll(replaceStr);
        System.out.println(newstring);
        return newstring;
    }

    /**
     * Demonstrates {@link String#split(String)} with a regular expression
     * delimiter (one digit followed by {@code A}) and prints each piece.
     */
    public static void testSplit() {
        String str = "abc5Adefghi7Ajklmn";
        for (String part : str.split("(\\d)A")) {
            System.out.println(part);
        }
    }
}
二:使用Java正则表达式抓取文本中的指定信息
package com.kawa.regex;
import java.io.*;
import java.util.regex.*;
/**
 * Scans a text file line by line and prints every QQ mail address
 * (digits followed by {@code @qq.com}) found in it, one per output line.
 */
public class GetEmailByFile {

    /** Path of the text file that {@link #main} scans. */
    private static final String EMAIL_FILE = "D:\\email.txt";

    /**
     * Matches a numeric QQ mail address, case-insensitively. The dot is
     * escaped so that only a literal {@code .} is accepted between
     * {@code qq} and {@code com}; unescaped it would match any character.
     */
    private static final Pattern QQ_EMAIL =
            Pattern.compile("\\d+@qq\\.com", Pattern.CASE_INSENSITIVE);

    /**
     * Reads the input file and prints each address found.
     *
     * @param args ignored
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        File file = new File(EMAIL_FILE);
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = QQ_EMAIL.matcher(line);
                while (m.find()) {
                    System.out.println(m.group());
                }
            }
        }
    }
}
示例文本:email.txt
641785824@qq.com 878234160@qq.com
916951890@qq.com 445349623@qq.com
449272633@qq.com 348408109@qq.com
942075561@qq.com 447151580@qq.com
1162650614@qq.com
1392463323
三:Java小爬虫:通过正则表达式抓取网页中的邮箱和URL
package com.kawa.regex;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.*;
/**
 * Minimal crawler demo: downloads one web page and prints every e-mail
 * address and every URL found in its source, line by line.
 */
public class RegexDemo03 {

    /** Address of the page whose source is scanned. */
    private static final String PAGE_URL = "http://www.baidu.com";

    /** Matches e-mail addresses. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile(
            "[\\w!#$%&'*+/=?^_`{|}~-]+(?:\\.[\\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\\w](?:[\\w-]*[\\w])?\\.)+[\\w](?:[\\w-]*[\\w])?",
            Pattern.CASE_INSENSITIVE);

    /** Matches http, https and ftp URLs. */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?",
            Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the page and prints, for each line of its source, the e-mail
     * matches followed by the URL matches. A non-200 response or any failure
     * is reported on standard output instead of being thrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(PAGE_URL);
            connection = (HttpURLConnection) url.openConnection();
            int responseCode = connection.getResponseCode();
            if (responseCode != 200) {
                System.out.println("获取不到网页的源码,服务器响应代码为:" + responseCode);
                return;
            }
            // try-with-resources closes the reader (and the underlying
            // response stream) even if reading fails part-way through.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    printMatches(EMAIL_PATTERN, line);
                    printMatches(URL_PATTERN, line);
                }
            }
        } catch (Exception e) {
            // Top-level boundary of the demo: report the failure and exit normally.
            System.out.println("获取不到网页的源码,出现异常:" + e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /** Prints every substring of {@code line} matched by {@code pattern}, one per line. */
    private static void printMatches(Pattern pattern, String line) {
        Matcher m = pattern.matcher(line);
        while (m.find()) {
            System.out.println(m.group());
        }
    }
}




