作者:jostree
链接: http://www.cnblogs.com/jostree/p/4374404.html
U-00000000 – U-0000007F: 0xxxxxxx
U-00000080 – U-000007FF: 110xxxxx 10xxxxxx
U-00000800 – U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 – U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 – U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 – U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2.读取GBK系列文本原理
3.C++代码实现
#ifndef TEXT_H
#define TEXT_H

#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>

using namespace std;

/**
 * Abstract base for an in-memory text file that is consumed one character
 * at a time.
 *
 * The constructor loads the file's raw bytes; derived classes implement
 * ReadOneChar() to split those bytes into characters for a given encoding.
 */
class Text
{
protected:
    char * m_binaryStr;  // raw file bytes; owned by this object (freed in the destructor)
    size_t m_length;     // number of bytes stored in m_binaryStr
    size_t m_index;      // byte offset of the next character to read

public:
    /** Loads the file located at `path`. */
    Text(string path);

    /** Moves the read cursor to byte offset `index`. */
    void SetIndex(size_t index);

    /**
     * Extracts the next character into `oneChar` and advances the cursor.
     * @return true when a character was read, false at the end of the data.
     */
    virtual bool ReadOneChar(string &oneChar) = 0;

    /** @return the size of the loaded data in bytes. */
    size_t Size();

    virtual ~Text();

private:
    // The object owns m_binaryStr through a raw pointer, so a member-wise
    // copy would make two objects delete the same buffer. Copying is
    // therefore disabled (declared private and intentionally not defined).
    Text(const Text &);
    Text & operator=(const Text &);
};

#endif
#include "Text.h"using namespace std;Text::Text(string path):m_index(0){filebuf *pbuf;ifstream filestr;// 采用二进制打开filestr.open(path.c_str(), ios::binary);if(!filestr){cerr<<path<<" Load text error."<<endl;return;}// 获取filestr对应buffer对象的指针pbuf=filestr.rdbuf();// 调用buffer对象方法获取文件大小m_length=(int)pbuf->pubseekoff(0,ios::end,ios::in);pbuf->pubseekpos(0,ios::in);// 分配内存空间m_binaryStr = new char[m_length+1];// 获取文件内容pbuf->sgetn(m_binaryStr,m_length);//关闭文件filestr.close();}void Text::SetIndex(size_t index){m_index = index;}size_t Text::Size(){return m_length;}Text::~Text(){delete [] m_binaryStr;}
#ifndef GBKTEXT_H#define GBKTEXT_H#include <iostream>#include <string>#include "Text.h"using namespace std;class GbkText:public Text{public:GbkText(string path);~GbkText(void);bool ReadOneChar(string & oneChar);};#endif
#include "GbkText.h"GbkText::GbkText(string path):Text(path){}GbkText::~GbkText(void) {}bool GbkText::ReadOneChar(string & oneChar){// return true 表示读取成功,// return false 表示已经读取到流末尾if(m_length == m_index)return false;if((unsigned char)m_binaryStr[m_index] < 0x81){oneChar = m_binaryStr[m_index];m_index++;}else{oneChar = string(m_binaryStr, 2);m_index += 2;}return true;}
#ifndef UTFTEXT_H#define UTFTEXT_H#include <iostream>#include <string>#include "Text.h"using namespace std;class UtfText:public Text{public:UtfText(string path);~UtfText(void);bool ReadOneChar(string & oneChar);private:size_t get_utf8_char_len(const char & byte);};#endif
#include "UtfText.h"UtfText::UtfText(string path):Text(path){}UtfText::~UtfText(void) {}bool UtfText::ReadOneChar(string & oneChar){// return true 表示读取成功,// return false 表示已经读取到流末尾if(m_length == m_index)return false;size_t utf8_char_len = get_utf8_char_len(m_binaryStr[m_index]);if( 0 == utf8_char_len ){oneChar = "";m_index++;return true;}size_t next_idx = m_index + utf8_char_len;if( m_length < next_idx ){//cerr << "Get utf8 first byte out of input src string." << endl;next_idx = m_length;}//输出UTF-8的一个字符oneChar = string(m_binaryStr + m_index, next_idx - m_index);//重置偏移量m_index = next_idx;return true;}size_t UtfText::get_utf8_char_len(const char & byte){// return 0 表示错误// return 1-6 表示正确值// 不会 return 其他值//UTF8 编码格式:// U-00000000 - U-0000007F: 0xxxxxxx// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxxsize_t len = 0;unsigned char mask = 0x80;while( byte & mask ){len++;if( len > 6 ){//cerr << "The mask get len is over 6." << endl;return 0;}mask >>= 1;}if( 0 == len){return 1;}return len;}
#ifndef TEXTFACTORY_H#define TEXTFACTORY_H#include <iostream>#include "Text.h"#include "UtfText.h"#include "GbkText.h"using namespace std;class TextFactory{public:static Text * CreateText(string textCode, string path);};#endif
#include "TextFactory.h"
#include "Text.h"

namespace
{

// Encoding names that are read with UtfText.
const char * const kUtfTextCodes[] = {
    "utf-8",
    "UTF-8",
    "ISO-8859-2",
    "ascii",
    "ASCII",
    "TIS-620",
    "ISO-8859-5",
    "ISO-8859-7"
};

// Encoding names that are read with GbkText.
const char * const kGbkTextCodes[] = {
    "windows-1252",
    "Big5",
    "EUC-KR",
    "GB2312",
    "ISO-2022-CN",
    "HZ-GB-2312",
    "gb18030"
};

// Returns true when `code` matches one of the `count` names in `codes` exactly.
bool IsListed(const char * const * codes, size_t count, const string & code)
{
    for (size_t i = 0; i < count; ++i)
    {
        if (code == codes[i])
        {
            return true;
        }
    }
    return false;
}

}

/**
 * Creates the reader that corresponds to the encoding name `textCode`.
 *
 * The comparison is an exact, case-sensitive string match.
 *
 * @param textCode encoding name of the file.
 * @param path     file to load.
 * @return a new UtfText or GbkText (owned by the caller), or NULL when the
 *         name is in neither list.
 */
Text * TextFactory::CreateText(string textCode, string path)
{
    const size_t utfCount = sizeof(kUtfTextCodes) / sizeof(kUtfTextCodes[0]);
    const size_t gbkCount = sizeof(kGbkTextCodes) / sizeof(kGbkTextCodes[0]);

    if (IsListed(kUtfTextCodes, utfCount, textCode))
    {
        return new UtfText(path);
    }
    if (IsListed(kGbkTextCodes, gbkCount, textCode))
    {
        return new GbkText(path);
    }
    return NULL;
}
#include <stdio.h>#include <string.h>#include <iostream>#include "Text.h"#include "TextFactory.h"#include "CodeDetector.h"using namespace std;int main(int argc, char *argv[]){string path ="日文";string code ="utf-8";Text * t = TextFactory::CreateText(code, path);string s;while(t->ReadOneChar(s)){cout<<s;}delete t;}
- EOF -


扫码关注
图文:龙小
排版:龙小
点赞和在看就是最大的支持❤️
文章转载自CPP开发前沿,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




