/*
 *	clipbpe.c
 *
 *	Byte Pair Encoding
 *
 *	CLiP - Common Library for P/ECE
 *	Copyright (C) 2015 Naoyuki Sawa
 *
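 *	* Thu Jan 29 21:20:19 JST 2015 Naoyuki Sawa
 *	- First release.
 *	- The following implementations were used as references:
 *	  "Vector - bpe - BPE compression/expansion command-line tool" (http://www.vector.co.jp/soft/dl/winnt/util/se498916.html)
 *	  Blog article on a Byte Pair Encoding implementation (http://musyozoku211.blog118.fc2.com/blog-entry-8.html)
 *	* Fri Jan 30 21:08:25 JST 2015 Naoyuki Sawa
 *	- Made major changes in order to implement a "function that expands one byte at a time (expansion stream)".
 *	  Regardless of the data contents, the one-byte-at-a-time expansion function now runs at a stable speed.
 *	  This change lowered the compression ratio slightly, but the loss should be almost negligible.
 *	* Mon Feb 02 00:23:19 JST 2015 Naoyuki Sawa
 *	- Added multi-thread support to BytePairEncodingEncoder_Encode().
 *	  If "Multithreaded" is selected as the run-time library in the Visual C++ 6.0 project settings, the multi-threaded version is used.
 *	  If "Single-Threaded" is selected as the run-time library in the Visual C++ 6.0 project settings, the same normal version as before is used.
 *	* Mon Feb 02 21:59:02 JST 2015 Naoyuki Sawa
 *	- Fixed a bug where the multi-threaded BytePairEncodingEncoder_Encode() never finished when (srcLen=1).
 *	* Tue Nov 03 21:42:14 JST 2015 Naoyuki Sawa
 *	- Following the multi-thread support added to "Boehm GC", this module also switched from _beginthread() to _beginthreadex().
 *	  For details, see the comment of the same date in /clip/tool/gc/README.TXT.
 *	* Sat Nov 14 21:55:38 JST 2015 Naoyuki Sawa
 *	- Reworked the multi-threaded implementation. The main changes are as follows.
 *	- The final 'SetEvent(pMT->hEvent)' in BytePairEncodingEncoder_EncodeTry() could run after 'CloseHandle(stMT.hEvent)' in BytePairEncodingEncoder_Encode(),
 *	  i.e. there was a bug where SetEvent() could be called after CloseHandle(). The reworked multi-threaded implementation no longer has this problem.
 *	  Including the fix above, the whole multi-threaded processing was reviewed and tidied up.
 *	- Until now, the single-threaded (normal) version and the multi-threaded version were switched automatically at build time according to the project settings.
 *	  From now on, the single-threaded function (BytePairEncodingEncoder_EncodeST) and the multi-threaded function (BytePairEncodingEncoder_EncodeMT) are called explicitly.
 *	  For the reason why the two versions are now called separately, see the comment of the same date in /clip/tool/dpbpe/main.c.
 *	  However, a BytePairEncodingEncoder_Encode macro is also defined to support automatic switching, so existing application sources do not need to change.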
 */
#include "clip.h"

/****************************************************************************
 *	BytePairEncoding encoder
 ****************************************************************************/
#ifdef  WIN32
#define BYTEPAIRENCODINGENCODER_PRINTPROGRESS //When this symbol is defined, the encoder functions in this file print progress messages to stderr. Comment this line out to disable the progress display.
#endif//WIN32
//Work area used while compressing one block.
//It is allocated with (sizeof(ST_BytePairEncodingEncoder) + srcLen) bytes so that the trailing src[] buffer can hold a copy of the source data.
typedef struct _ST_BytePairEncodingEncoder {
	int		valueCount[65536];	//Occurrence counters. Indexed by byte value ([0]..[255]) when counting single bytes, and by the 16-bit value (first code | second code<<8) when counting pairs.
	unsigned char	charToCode[256];	//Conversion table from source byte value to code. Only entries for byte values that occur in the block are filled in.
	unsigned short	codeValue[256];		//Conversion table from code to its meaning. [0] is not used. Codes assigned to single bytes hold the byte value (low 8 bits only). Codes assigned to pairs hold (first code | second code<<8).
	unsigned char	codeDepth[256];		//Nesting depth of each code: 0 for a code assigned to a single byte, max(depth of first, depth of second)+1 for a code assigned to a pair. Pair candidates are limited by the size of ST_BytePairEncodingDecoder.stk, so this presumably corresponds to the decoder's stack usage - TODO confirm against the decoder.
	unsigned char	src[0/*at least srcLen*/];	//Buffer that receives a private copy of the source data; it is rewritten in place during compression.
} ST_BytePairEncodingEncoder;
/*--------------------------------------------------------------------------*/
//Local helper called by the BytePairEncodingEncoder_Encode*() functions; it compresses a single block.
//Compresses one block with Byte Pair Encoding.
//
//	pEncoder	Work area. Its src[] buffer must have room for at least srcLen bytes.
//	_src, srcLen	Source block. It is copied into the work area and is not modified.
//	_dst, dstCap	Output buffer and its capacity. When dstCap is 0 nothing is written and only the
//			required size is computed. When dstCap is non-zero, bytes are stored only while they
//			fit (dstLen < dstCap), but the returned length still counts every byte.
//
//Returns the length of the compressed block, or -1 when the block is judged 'not compressible'
//(all 256 byte values occur so no code is left, or the compressed block would exceed 65536 bytes).
//
//Layout of the compressed block:
//	[number of codes assigned to single bytes] [number of codes assigned to pairs]
//	[byte value of each single-byte code] [second code, first code of each pair code]
//	[encoded data] [terminator code 0]
static int BytePairEncodingEncoder_EncodeSub(ST_BytePairEncodingEncoder* pEncoder, const void* _src, int srcLen, void* _dst, int dstCap) {
	unsigned char* src = memcpy(pEncoder->src, _src, srcLen);	//Work on a private copy of the source; it is rewritten in place below.
	unsigned char* dst = _dst;
	int dstLen;
	int c, lo, hi, iSrc, nChar, nCode;
	//Count the occurrences of every byte value.
	memset(pEncoder->valueCount, 0, sizeof pEncoder->valueCount);
	for(iSrc = 0; iSrc < srcLen; iSrc++) {
		c = src[iSrc];
		pEncoder->valueCount[c]++;
	}
	//Assign a code to every byte value that occurs.
	nCode = 1;	//Number of codes in use. Code [0] is reserved (terminator), so assignment starts at [1].
	for(c = 0; c < 256; c++) {
		if(pEncoder->valueCount[c]) {
			if(nCode >= 256) { return -1; }		//No code left: all 256 byte values occur, so the block is 'not compressible'.
			pEncoder->charToCode[c] = nCode;	//byte -> code
			pEncoder->codeValue[nCode] = c;		//code -> byte
			nCode++;
		}
	}
	nChar = nCode;	//Codes [1]..[nChar-1] are single bytes; pair codes start at [nChar].
	//Replace every source byte with its code.
	for(iSrc = 0; iSrc < srcLen; iSrc++) {
		c = src[iSrc];
		c = pEncoder->charToCode[c];
		src[iSrc] = c;
	}
	//--- Repeatedly replace the most frequent pair with a new code ---
	memset(pEncoder->codeDepth, 0, sizeof pEncoder->codeDepth);	//Single-byte codes have depth 0.
	while(nCode < 256) {	//While a code is still available for a pair...
		int cPrev, maxPair, maxCount;
		//Count the occurrences of every pair.
		memset(pEncoder->valueCount, 0, sizeof pEncoder->valueCount);
		cPrev = -1;
		for(iSrc = 0; iSrc < srcLen - 1; iSrc++) {
			c = src[iSrc] | (src[iSrc+1]<<8);
			//A pair that equals the pair counted at the previous position overlaps it and cannot be
			//replaced together with it, so it is skipped:
			//	'AAA'  : 'AA_' is counted, '_AA' is not.
			//	'AAAA' : 'AA__' is counted, '_AA_' is not, '__AA' is counted.
			cPrev = (cPrev != c) ? c : -1;
			if(cPrev != -1) { pEncoder->valueCount[c]++; }	//cPrev is -1 exactly when this pair was skipped. (c itself is never -1.)
		}
		//Find the pair with the highest count among the pairs whose codes are shallow enough.
		maxPair = maxCount = 0;
		for(c = 0; c < 65536; c++) {
			lo = (unsigned char)(c>>0);	//first code of the pair
			hi = (unsigned char)(c>>8);	//second code of the pair
			lo = pEncoder->codeDepth[lo];	//depth of the first code
			hi = pEncoder->codeDepth[hi];	//depth of the second code
			if(max(lo,hi) < FIELD_SIZEOF(ST_BytePairEncodingDecoder,stk)) {	//Only pairs whose depth stays within the decoder's stk field may get a code.
				if(pEncoder->valueCount[c] > maxCount) {
					maxPair  = c;
					maxCount = pEncoder->valueCount[c];
				}
			}
		}
		//Stop when no pair occurs at least twice.
		if(maxCount < 2) { break; }
		//Replace the pair with the new code, compacting the data in place.
		for(iSrc = 0, dstLen = 0; iSrc < srcLen; iSrc++, dstLen++) {
			c = src[iSrc];
			if(iSrc < srcLen - 1) { c |= (src[iSrc+1]<<8); }	//At the last position the upper 8 bits stay 0. Code [0] is never used in the data, so this cannot match maxPair by accident.
			if(c == maxPair) { c = nCode; iSrc++; }
			src[dstLen] = c;
		}
		srcLen = dstLen;
		//Register the new code.
		pEncoder->codeValue[nCode] = maxPair;		//code -> pair
		lo = (unsigned char)(maxPair>>0);		//first code of the pair
		hi = (unsigned char)(maxPair>>8);		//second code of the pair
		lo = pEncoder->codeDepth[lo];			//depth of the first code
		hi = pEncoder->codeDepth[hi];			//depth of the second code
		pEncoder->codeDepth[nCode] = max(lo,hi) + 1;	//depth of the new code
		nCode++;
	}
	//--- Output the compressed block ---
	dstLen = 0;
	//If an output buffer was specified...
	if(dstCap) {	//This test is only an optimization; the branch below yields the same length.
		//Output the conversion table.
		if(dstLen < dstCap) { *dst++ = (nChar -     1); } dstLen++;	//number of codes assigned to single bytes
		if(dstLen < dstCap) { *dst++ = (nCode - nChar); } dstLen++;	//number of codes assigned to pairs
		for(c = 1; c < nCode; c++) {					//For every code except [0]...
			if(c < nChar) {	//code assigned to a single byte
				if(dstLen < dstCap) { *dst++ = (unsigned char) pEncoder->codeValue[c]    ; } dstLen++;	//the byte value
			} else {	//code assigned to a pair
				if(dstLen < dstCap) { *dst++ = (unsigned char)(pEncoder->codeValue[c]>>8); } dstLen++;	//second code of the pair first,
				if(dstLen < dstCap) { *dst++ = (unsigned char)(pEncoder->codeValue[c]>>0); } dstLen++;	//then the first code. (Per the original note this order suits the decoder; see BytePairEncodingDecoder_Getc().)
			}
		}
		//Output the encoded data.
		for(iSrc = 0; iSrc < srcLen; iSrc++) {
			if(dstLen < dstCap) { *dst++ = src[iSrc]; } dstLen++;
		}
		//Output the terminator (code [0]).
		if(dstLen < dstCap) { *dst++ = 0; } dstLen++;
	//If no output buffer was specified, only compute the length.
	} else {
		dstLen = 1			//number of codes assigned to single bytes
		       + 1			//number of codes assigned to pairs
		       + (nCode -     1)	//one byte per code (single bytes and pairs)
		       + (nCode - nChar)	//one more byte per pair code
		       + srcLen			//encoded data
		       + 1;			//terminator (code [0])
	}
	//A compressed block longer than 65536 bytes is judged 'not compressible'.
	//(Per the original note the limit comes from the range of ST_BytePairEncodingDecoder.ofs; exactly 65536 is
	// still acceptable because the offset just past the terminator is never stored there. See BytePairEncodingDecoder_Getc().)
	if(dstLen > 65536) { return -1; }
	//Return the length of the compressed block.
	return dstLen;
}
/*--------------------------------------------------------------------------*/
//Functions that compress the whole data in a single call
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
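//Single-threaded version	(originally the version selected when the project setting [C/C++] - [Code Generation] - [Use run-time library] was "Single-Threaded" or "Debug Single-Threaded")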
//Returns the source offset at which block 'iBlk' starts when 'srcLen' bytes are split into 'divCnt' blocks.
//The product is formed in 64 bits because (srcLen * iBlk) can overflow an int for inputs of several megabytes.
static int BytePairEncodingEncoderST_BlockStart(int srcLen, int iBlk, int divCnt) {
	int64_t scaled = (int64_t)srcLen * (int64_t)iBlk;
	return (int)(scaled / (int64_t)divCnt);
}
//Compresses the whole source buffer on the calling thread.
//
//The source is split into 1, 2, 4, ... equally sized blocks; for each division count the total compressed
//size is measured, and the search stops as soon as the total becomes larger than the best one seen so far.
//The blocks of the best division are then written to _dst, followed by one 0 byte.
//Bytes are stored only while they fit into dstCap, but the returned length counts every byte.
int BytePairEncodingEncoder_EncodeST(const void* _src, int srcLen, void* _dst, int dstCap) {
	const unsigned char* in  = _src;
	unsigned char*       out = _dst;
	int dstLen = 0;	//Stays 0 when there is no source data.
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
	int newMsgLen, oldMsgLen = 0;
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
	if(srcLen != 0) {
		int bestParts = 1;	//division count that gave the smallest total so far
		int bestTotal = -1;	//smallest total so far (-1 = none yet)
		int parts, blk, from, size, got, rejected;
		//Allocate the work area.
		ST_BytePairEncodingEncoder* work = malloc(sizeof(ST_BytePairEncodingEncoder) + srcLen);
		if(work == NULL) { DIE(); }
		//--- Search for the best division count (powers of two only) ---
		for(parts = 1; parts <= srcLen; parts <<= 1) {
			dstLen = 0;
			rejected = 0;
			for(blk = 0; blk < parts; blk++) {
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
				newMsgLen = fprintf(stderr, "\rtry srcLen=%d minDstLen=%d minDivCnt=%d divCnt=%d (%d/%d)", srcLen, bestTotal, bestParts, parts, blk+1, parts);
				if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); }
				oldMsgLen = newMsgLen;
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
				//Measure the compressed size of this block (no output buffer).
				from = BytePairEncodingEncoderST_BlockStart(srcLen, blk,     parts);
				size = BytePairEncodingEncoderST_BlockStart(srcLen, blk + 1, parts) - from;
				got  = BytePairEncodingEncoder_EncodeSub(work, in + from, size, NULL, 0);
				if(got < 0) {	//A block is 'not compressible': give up this division count.
					rejected = 1;
					break;
				}
				dstLen += got;
			}
			if(rejected) { continue; }	//Try the next division count.
			//The unsigned comparison makes the initial bestTotal (-1) the largest possible value.
			if((unsigned)dstLen > (unsigned)bestTotal) { break; }	//The total started growing: further division is judged useless.
			bestParts = parts;
			bestTotal = dstLen;
		}
		//Small enough blocks are always compressible, so at least one division count must have succeeded.
		if(bestTotal < 0) { DIE(); }
		//--- Output the compressed data using the best division count ---
		dstLen = 0;
		for(blk = 0; blk < bestParts; blk++) {
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
			newMsgLen = fprintf(stderr, "\rout srcLen=%d dstLen=%d divCnt=%d (%d/%d)", srcLen, dstLen, bestParts, blk+1, bestParts);
			if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); }
			oldMsgLen = newMsgLen;
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
			from = BytePairEncodingEncoderST_BlockStart(srcLen, blk,     bestParts);
			size = BytePairEncodingEncoderST_BlockStart(srcLen, blk + 1, bestParts) - from;
			got  = BytePairEncodingEncoder_EncodeSub(work, in + from, size, out + dstLen, dstCap - dstLen);
			if(got < 0) { DIE(); }	//Cannot happen: the same block was compressible during the search.
			dstLen += got;
		}
		//Release the work area.
		free(work);
	}
	//Append a block header with (number of single-byte codes = 0), which marks the end of the data.
	//The length is incremented whether or not the byte fits, so the result is the same with and without an output buffer.
	if(dstCap > dstLen) { out[dstLen] = 0; }
	dstLen++;
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
	newMsgLen = fprintf(stderr, "\rend srcLen=%d dstLen=%d", srcLen, dstLen);
	if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); }
	putc('\n', stderr);
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
	//Return the length of the compressed data.
	return dstLen;
}
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
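//Multi-threaded version	(originally the version selected when the project setting [C/C++] - [Code Generation] - [Use run-time library] was "Multithreaded" or "Debug Multithreaded")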
#if     defined(_WIN32) && defined(_MT)
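//{{2015/11/14 change: reworked the multi-threaded implementation. (The commented-out code below is the previous implementation.)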
//typedef struct _ST_BytePairEncodingEncoderMT {
//	const unsigned char*	src/*[srcLen]*/;
//	int			srcLen;
//	int			divLev;				//Ɏs镪x-1	InterlockedIncrement()găANZXdivCnt=(1<<߂l)sB̖߂l0Ƃ邽߂-1ɏB
//	int			dstLen[32/*divLev*/];		//0=,-1=ks,>0=kf[^
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//	int			  iBlk[32/*divLev*/];
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//	HANDLE			hEvent;				//TuXbh\̂ύXACXbh֒ʒm邽߂̃Cxg
//	int			reqExit;			//CXbhATuXbhւ̏IvtO	TuXbh擾kf[^𒴉߂ꍇI܂B
//	int			numThread;			//IĂȂTuXbh̐	CNgTuXbhōsBSł闝RBytePairEncodingEncoder_Encode()́uTuXbh̎̏҂ṽRgQƂB
//} ST_BytePairEncodingEncoderMT;
////{{2015/11/03ύX:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
////static void BytePairEncodingEncoder_EncodeTry(volatile ST_BytePairEncodingEncoderMT* pMT) {
////2015/11/03ύX:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
//static unsigned __stdcall BytePairEncodingEncoder_EncodeTry(volatile ST_BytePairEncodingEncoderMT* pMT) {
////}}2015/11/03ύX:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
//	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
//	InterlockedIncrement((int*)&pMT->numThread);
//	{
//		const unsigned char* src    = pMT->src;
//		int                  srcLen = pMT->srcLen;
//		int iBlk, divLev, divCnt, dstLen, blkPos, blkLen, tmpLen;
//		//[NmۂB
//		ST_BytePairEncodingEncoder* pEncoder = malloc(sizeof(ST_BytePairEncodingEncoder) + srcLen);
//		if(!pEncoder) { DIE(); }
//		for(;;) {
//			divLev = InterlockedIncrement((int*)&pMT->divLev);	//Ɏs镪x擾B
//			divCnt = (1 << divLev);
//			if(divCnt > srcLen) { break; }				//TuXbh擾kf[^𒴉߂ꍇI܂B
//			dstLen = 0;
//			for(iBlk = 0; iBlk < divCnt; iBlk++) {
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//				pMT->iBlk[divLev] = iBlk;
//				SetEvent(pMT->hEvent);
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//				//̃ubN̈kf[^߁AZB
//				blkPos = (int)((int64_t)srcLen * (int64_t)(iBlk + 0) / (int64_t)divCnt);		//̓švZint64_t^ւ̃LXgK{łBint^̂܂܌vZƁA
//				blkLen = (int)((int64_t)srcLen * (int64_t)(iBlk + 1) / (int64_t)divCnt) - blkPos;	//MBȏ̃t@Ck悤ƂɃI[o[t[ĈُI܂B
//				tmpLen = BytePairEncodingEncoder_EncodeSub(pEncoder, &src[blkPos], blkLen, NULL, 0);
//				if(tmpLen < 0) { dstLen = -1; break; }
//				dstLen += tmpLen;
//				if(pMT->reqExit) { goto L_EXIT; }
//			}
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//			pMT->iBlk[  divLev] = iBlk;
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//			pMT->dstLen[divLev] = dstLen;
//			SetEvent(pMT->hEvent);
//		}
//L_EXIT:		//[NJB
//		free(pEncoder);
//	}
//	InterlockedDecrement((int*)&pMT->numThread);
//	SetEvent(pMT->hEvent);				//{{2015/11/14RgǋL:BytePairEncodingEncoder_Encode()'CloseHandle(stMT.hEvent)'ABytePairEncodingEncoder_EncodeTry()̍Ō'SetEvent(pMT->hEvent)'ɍs\LACloseHandle()SetEvent()s\̗LoOL܂B{A}`Xbhł̎CāA̕s͖Ȃ܂B}}
////{{2015/11/03ǉ:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
//	return 0;
////}}2015/11/03ǉ:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
//}
//int BytePairEncodingEncoder_Encode(const void* _src, int srcLen, void* _dst, int dstCap) {
//	const unsigned char* src = _src;
//	      unsigned char* dst = _dst;
//	int dstLen = 0;	//(srcLen=0)ꍇ̂߂(dstLen=0)ƂĂB
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//	int newMsgLen, oldMsgLen = 0;
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//	//kf[^łȂ΁c
//	if(srcLen) {
//		int iBlk, divCnt, blkPos, blkLen, tmpLen;
//		//[NmۂB
//		ST_BytePairEncodingEncoder* pEncoder = malloc(sizeof(ST_BytePairEncodingEncoder) + srcLen);
//		if(!pEncoder) { DIE(); }
//		//--- œKȕ߂鏈 ---
//		{
//			volatile ST_BytePairEncodingEncoderMT stMT;
//			SYSTEM_INFO si;
//			int i, divLev;
//			memset((void*)&stMT, 0, sizeof stMT);
//			stMT.src    = src;
//			stMT.srcLen = srcLen;
//			stMT.divLev = -1;			//Ɏs镪x-1	InterlockedIncrement()găANZXdivCnt=(1<<߂l)sB̖߂l0Ƃ邽߂-1ɏB
//			stMT.hEvent = CreateEvent(NULL, FALSE/*ZbgIuWFNg*/, FALSE/*VOi*/, NULL);
//		//{{2015/02/02ǉ:}`XbhłBytePairEncodingEncoder_Encode()(srcLen=1)̏ꍇɏIȂoOC܂B
//		// * Mon Feb 02 21:59:02 JST 2015 Naoyuki Sawa
//		// - }`XbhłBytePairEncodingEncoder_Encode()(srcLen=1)̏ꍇɏIȂoOC܂B
//		// - L̃oOĂR͈ȉ̒ʂłB
//		//   }`XbhłBytePairEncodingEncoder_Encode()́uœKȕ߂鏈v̏ÍAuׂ荇xk\őXɓ]ĂӏvłB
//		//   (srcLen=1)̏ꍇ́ABytePairEncodingEncoder_EncodeTry()kx0TȂ̂ŁAL̏𖞂ABytePairEncodingEncoder_Encode()܂ł҂Ă܂łB
//		// - L̃oOC邽߂ɁA(srcLen=1)̏ꍇ̂݁AIɕx0IᏈǉ܂B
//		// - (srcLen=0)̏ꍇ͏ŏOĂ̂ŁAōlKv͗L܂B
//		//   (srcLen=2)̏ꍇ͕x01T̂ŁAiǂ̏I𖞂ďÎŁAōlKv͗L܂B
//		// - Aŏqׂ΍̓}`Xbhłł̂ݕKvȑ΍łAʏłł(srcLen=1)ɂ͓Ꮘ͕Kvł܂B
//		//   ȂȂAʏł̒Tł؂́uׂ荇xk\őXɓ]Ăӏvł͂ȂA
//		//   uk\ȕɎčsđXɓ]Ƃvł̂ŁA(srcLen=1)̏ꍇiǂ̏Ŗ薳łB
//			if(srcLen == 1) { divLev = 0; goto L_EXIT; }
//		//}}2015/02/02ǉ:}`XbhłBytePairEncodingEncoder_Encode()(srcLen=1)̏ꍇɏIȂoOC܂B
//			GetSystemInfo(&si);
//			for(i = 0; (int)i < (int)si.dwNumberOfProcessors; i++) {			//_vZbT̐c
////{{2015/11/03ύX:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
////				_beginthread(BytePairEncodingEncoder_EncodeTry, 0, (void*)&stMT);	//TuXbh쐬B
////2015/11/03ύX:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
//				HANDLE hThread = (HANDLE)_beginthreadex(NULL, 0, BytePairEncodingEncoder_EncodeTry, (void*)&stMT, 0, NULL);
//				if(!hThread) { DIE(); }
//				CloseHandle(hThread);	//Xbh̏I҂̓CxggĂ̂ŃXbhnh͕KvB]ăXbh̏I҂ɃXbhnhč\ȂB
////}}2015/11/03ύX:uBoehm GCv}`XbhΉɂɔAW[_beginthread()_beginthreadex()ɕύX܂Bڍׂ́A/clip/tool/gc/README.TXT ̓̃RgQƂĉB
//			}
//			for(;;) {
//				WaitForSingleObject(stMT.hEvent, INFINITE);	//TuXbh̎̏҂B	ȂƂ͂ő҂̂ŁAstMT.numThread̏CNgTuXbhōsĈSłB
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//				divLev = stMT.divLev;
//				if((unsigned)divLev < 32) {	//(divLev<0),,(divLev>31)ɂȂ\L̂ŏO鎖YȂ悤ɁB
//					divCnt = (1 << divLev);
//					iBlk = stMT.iBlk[divLev];
//					newMsgLen = fprintf(stderr, "\rtry srcLen=%d divCnt=%d (%d/%d)", srcLen, divCnt, iBlk, divCnt);
//					if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); } oldMsgLen = newMsgLen;
//				}
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//				for(divLev = 0; divLev < (32 - 1); divLev++) {		//ׂ荇xɂāc
//					int dstLen0 = stMT.dstLen[divLev + 0];		//(x+0)̈kf[^擾BAg~bNłȂč\Ȃ̂ŁAڎQƉ\łB
//					int dstLen1 = stMT.dstLen[divLev + 1];		//(x+1)̈kf[^擾B
//					if(!dstLen0 || !dstLen1) { break; }		//(x+0),,(x+1)ȂΔāATuXbh̎̏҂B
//					if((dstLen0 > 0) &&				//(x+0)\Łc
//					   (dstLen1 > 0) &&				//(x+1)\Łc
//					   (dstLen0 < dstLen1)) { goto L_EXIT; }	//kf[^Xɓ]AȏׂĂʂƔfāAł؂B
//				}
//			}
//L_EXIT:			stMT.reqExit = 1;							//{{2015/02/02Rgǉ:(srcLen=1)̏ꍇ̏番򂵂Ă͂̏̓_~[ƂȂB}}
//			while(stMT.numThread) { WaitForSingleObject(stMT.hEvent, INFINITE); }	//{{2015/02/02Rgǉ:(srcLen=1)̏ꍇ̏番򂵂Ă̓TuXbh쐬ĂȂ̂stMT.numThread͕ωĂ炸̏̓_~[ƂȂB}}
//			CloseHandle(stMT.hEvent);	//{{2015/11/14RgǋL:BytePairEncodingEncoder_Encode()'CloseHandle(stMT.hEvent)'ABytePairEncodingEncoder_EncodeTry()̍Ō'SetEvent(pMT->hEvent)'ɍs\LACloseHandle()SetEvent()s\̗LoOL܂B{A}`Xbhł̎CāA̕s͖Ȃ܂B}}
//			divCnt = (1 << divLev);
//		}
//		//--- kf[^o͂鏈 ---
//		{
//			dstLen = 0;
//			for(iBlk = 0; iBlk < divCnt; iBlk++) {
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//				newMsgLen = fprintf(stderr, "\rout srcLen=%d dstLen=%d divCnt=%d (%d/%d)", srcLen, dstLen, divCnt, iBlk+1, divCnt);
//				if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); } oldMsgLen = newMsgLen;
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//				//̃ubNkB
//				blkPos = (int)((int64_t)srcLen * (int64_t)(iBlk + 0) / (int64_t)divCnt);		//̓švZint64_t^ւ̃LXgK{łBint^̂܂܌vZƁA
//				blkLen = (int)((int64_t)srcLen * (int64_t)(iBlk + 1) / (int64_t)divCnt) - blkPos;	//MBȏ̃t@Ck悤ƂɃI[o[t[ĈُI܂B
//				tmpLen = BytePairEncodingEncoder_EncodeSub(pEncoder, &src[blkPos], blkLen, &dst[dstLen], dstCap - dstLen);
//				if(tmpLen < 0) { DIE(); }	//'œKȕ߂鏈''k\'Ɣfꂽ̂ɁA'ks'Ɣf鎖͗L蓾ȂB
//				dstLen += tmpLen;		//kf[^̏o̓TCY𑝂₷B
//			}
//		}
//		//[NJB
//		free(pEncoder);
//	}
//	//(Ɋ蓖Ă̐=0)o͂BfR[_͂āAI[ubNł鎖𔻒fB
//	// - ōs'dstLen++'́A'œKȕ߂鏈'ɂ͊܂܂ĂȂ̂ŁAo̓obt@̗LɊ֌WsKvLB
//	//   o̓obt@w肳ĂȂɂ̏sȂƁAo̓obt@w肳ĂȂ(-1)̌ʍقB
//	if(dstLen < dstCap) { dst[dstLen] = 0; } dstLen++;
//#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{i\p
//	newMsgLen = fprintf(stderr, "\rend srcLen=%d dstLen=%d", srcLen, dstLen);
//	if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); } putc('\n', stderr);
//#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}}i\p
//	//kf[^ԂB
//	return dstLen;
//}
//2015/11/14 change: reworked the multi-threaded implementation. (The current implementation follows.)
//State shared between the main thread (BytePairEncodingEncoder_EncodeMT) and its worker threads (BytePairEncodingEncoderMT_EncodeTry).
typedef struct _ST_BytePairEncodingEncoderMT {
	const unsigned char*	src/*[srcLen]*/;	//Source data (read only).
	int			srcLen;			//Length of the source data.
	int			divLev;			//(Next division level to try)-1. Workers fetch a level with InterlockedIncrement() and use divCnt=(1<<returned value). Initialized to -1 so that the first returned value is 0.
	int			dstLen[32/*divLev*/];	//Result per division level: 0=not computed yet, -1=not compressible, >0=compressed data length.
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
	int			  iBlk[32/*divLev*/];	//Block index a worker is currently processing, per division level.
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
	HANDLE			hEvent[2];		//[0]=event a worker sets after changing dstLen[*]. [1]=event a worker sets after changing iBlk[*]. To keep the code simple, [1] is created even when BYTEPAIRENCODINGENCODER_PRINTPROGRESS is not defined, but it is not used then.
	int			reqExit;		//Exit request flag from the main thread to the workers. (A worker also exits by itself when the division count it fetched exceeds the source length.)
} ST_BytePairEncodingEncoderMT;
//Worker thread body of BytePairEncodingEncoder_EncodeMT().
//
//Each worker repeatedly fetches the next division level, measures the total compressed size obtained when
//the source is split into (1<<level) blocks, stores the result in pMT->dstLen[level] and signals
//pMT->hEvent[0]. The worker ends when no usable level is left or when the main thread sets pMT->reqExit.
//Always returns 0.
static unsigned __stdcall BytePairEncodingEncoderMT_EncodeTry(volatile ST_BytePairEncodingEncoderMT* pMT) {
	const unsigned char* src    = pMT->src;
	int                  srcLen = pMT->srcLen;
	int iBlk, divLev, divCnt, dstLen, blkPos, blkLen, tmpLen;
	//Allocate the work area (one per worker).
	ST_BytePairEncodingEncoder* pEncoder = malloc(sizeof(ST_BytePairEncodingEncoder) + srcLen);
	if(!pEncoder) { DIE(); }
	//Lower the priority of the worker thread.
	// - Per the original note: the main thread only waits for events from the workers, so it should have the
	//   higher priority; otherwise its exit decision can be delayed and the total time grows needlessly.
	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
	for(;;) {
		divLev = InterlockedIncrement((int*)&pMT->divLev);	//Fetch the next division level to try.
		//Levels outside 0..30 are never usable: (1<<divLev) must fit in an int (and cannot be <= srcLen beyond
		//that anyway), and divLev also indexes the 32-entry pMT->dstLen[]/iBlk[] arrays. Every worker fetches
		//one level past the last usable one before it exits, so on machines with many processors divLev does
		//reach 31 and above; without this check the shift overflows and the arrays are written out of bounds.
		if((divLev < 0) || (divLev > 30)) { break; }	//pMT was not modified, so SetEvent(pMT->hEvent[0/*dstLen*/]) is not needed.
		divCnt = (1 << divLev);
		if(divCnt > srcLen) { break; }	//The fetched division count exceeds the source length: exit. pMT was not modified, so SetEvent(pMT->hEvent[0/*dstLen*/]) is not needed.
		dstLen = 0;
		for(iBlk = 0; iBlk < divCnt; iBlk++) {
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
			pMT->iBlk[divLev] = iBlk;
			SetEvent(pMT->hEvent[1/*iBlk*/]);
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
			//Measure the compressed size of this block and add it to the total.
			blkPos = (int)((int64_t)srcLen * (int64_t)(iBlk + 0) / (int64_t)divCnt);		//The casts to int64_t are required: computed in int,
			blkLen = (int)((int64_t)srcLen * (int64_t)(iBlk + 1) / (int64_t)divCnt) - blkPos;	//the product overflows for files of several megabytes.
			tmpLen = BytePairEncodingEncoder_EncodeSub(pEncoder, &src[blkPos], blkLen, NULL, 0);
			if(tmpLen < 0) { dstLen = -1; break; }	//A block is 'not compressible': record -1 for this level.
			dstLen += tmpLen;
		    //{{ This range is only a speed-up. Per the original note, removing it does not change the result.
			if(divLev) {							//If this is not level 0, (level-1) is compressible, and the running total already exceeds the total of (level-1), stop early.
				int dstLen2 = pMT->dstLen[divLev - 1];			//The value stored in pMT->dstLen[divLev] need not be exact as long as it is larger than pMT->dstLen[divLev-1], so the partial total is stored.
				if((dstLen2 > 0) && (dstLen > dstLen2)) { break; }	//The main thread will find the exit condition between these adjacent levels and adopt (level-1), or an earlier level that already satisfies it.
			}
		    //}} This range is only a speed-up. Per the original note, removing it does not change the result.
			if(pMT->reqExit) { goto L_EXIT; }	//When pMT->reqExit is set, the main thread no longer waits on pMT->hEvent, so SetEvent(pMT->hEvent[0/*dstLen*/]) is not needed.
		}
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
		pMT->iBlk[divLev] = iBlk;
		SetEvent(pMT->hEvent[1/*iBlk*/]);
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
		pMT->dstLen[divLev] = dstLen;		//Publish the result of this level...
		SetEvent(pMT->hEvent[0/*dstLen*/]);	//...and notify the main thread.
	}
L_EXIT:	//Release the work area.
	free(pEncoder);
	return 0;
}
//Compresses the whole source buffer, using worker threads to search for the best division count.
//
//Worker threads (BytePairEncodingEncoderMT_EncodeTry) measure the total compressed size for the division
//counts 1, 2, 4, ... in parallel. The main thread waits until two adjacent division levels are both
//compressible and the larger division count gives the larger total, adopts the smaller one, stops the
//workers, and then writes the compressed blocks itself, followed by one 0 byte.
//Bytes are stored only while they fit into dstCap, but the returned length counts every byte.
int BytePairEncodingEncoder_EncodeMT(const void* _src, int srcLen, void* _dst, int dstCap) {
	const unsigned char* src = _src;
	      unsigned char* dst = _dst;
	int dstLen = 0;			//Stays 0 when (srcLen=0).
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
	int newMsgLen, oldMsgLen = 0;
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
	//If there is source data...
	if(srcLen) {
		int iBlk, divCnt, divLev = 0;	//Stays 0 when (srcLen=1), because the search below is skipped then.
		//Allocate the work area.
		ST_BytePairEncodingEncoder* pEncoder = malloc(sizeof(ST_BytePairEncodingEncoder) + srcLen);
		if(!pEncoder) { DIE(); }
		//--- Search for the best division count ---
		if(srcLen != 1) {	//(srcLen=1) is excluded: the workers would only compute level 0, so the 'adjacent levels' exit condition below could never be met and this thread would wait forever.
			volatile ST_BytePairEncodingEncoderMT stMT;
			SYSTEM_INFO si;
			HANDLE hThread[MAXIMUM_WAIT_OBJECTS/*=64*/];	//The number of workers is limited to the number of objects WaitForMultipleObjects() can wait for at once (64).
			int i, numThread;
			//Initialize the shared structure.
			memset((void*)&stMT, 0, sizeof stMT);
			stMT.src    = src;
			stMT.srcLen = srcLen;
			stMT.divLev = -1;	//(Next division level to try)-1. Workers fetch a level with InterlockedIncrement() and use divCnt=(1<<returned value); -1 makes the first returned value 0.
			stMT.hEvent[0/*dstLen*/] = CreateEvent(NULL, FALSE/*auto-reset*/, FALSE/*non-signaled*/, NULL);
			stMT.hEvent[1/*iBlk*/]   = CreateEvent(NULL, FALSE/*auto-reset*/, FALSE/*non-signaled*/, NULL);	//To keep the code simple, [1] is created even when BYTEPAIRENCODINGENCODER_PRINTPROGRESS is not defined, but it is not used then.
			//Create the worker threads, one per processor.
			GetSystemInfo(&si);
			numThread = si.dwNumberOfProcessors;
			if(numThread < 1) { DIE(); }	//Should not happen; checked just in case.
			if(numThread > ARRAY_SIZE(hThread)) { numThread = ARRAY_SIZE(hThread); }
			for(i = 0; i < numThread; i++) {
				hThread[i] = (HANDLE)_beginthreadex(NULL, 0, BytePairEncodingEncoderMT_EncodeTry, (void*)&stMT, 0, NULL);
				if(!hThread[i]) { DIE(); }
			}
			for(;;) {
				i = WaitForMultipleObjects(2, (HANDLE*)stMT.hEvent, FALSE, INFINITE);	//Wait for the next notification from a worker.	The cast only suppresses the volatile warning; it does not affect behavior.
				switch(i - WAIT_OBJECT_0) {
				default:DIE();
				case 0/*dstLen*/:
					for(divLev = 0; divLev < (32 - 1); divLev++) {		//For each pair of adjacent division levels...
						int dstLen0 = stMT.dstLen[divLev + 0];		//Total of (level+0). It need not be read atomically, so it is accessed directly.
						int dstLen1 = stMT.dstLen[divLev + 1];		//Total of (level+1).
						if(!dstLen0 || !dstLen1) { break; }		//If either is not computed yet, stop scanning and wait for the next notification.
						if((dstLen0 > 0) &&				//(level+0) is compressible, and...
						   (dstLen1 > 0) &&				//(level+1) is compressible, and...
						   (dstLen0 < dstLen1)) { goto L_EXIT; }	//...the total started growing: further division is judged useless, so adopt divLev and stop.
					}
					break;
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
				case 1/*iBlk*/:
					divLev = stMT.divLev;
					if((unsigned)divLev < 32) {	//(divLev<0) and (divLev>31) can occur, so they must be excluded.
						divCnt = (1 << divLev);
						iBlk = stMT.iBlk[divLev];
						newMsgLen = fprintf(stderr, "\rtry srcLen=%d divCnt=%d (%d/%d)", srcLen, divCnt, iBlk, divCnt);
						if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); } oldMsgLen = newMsgLen;
					}
					break;
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
				}
			}
L_EXIT:			//Clean up the worker threads: request exit, wait until all of them have ended, then close their handles.
			stMT.reqExit = 1;
			i = WaitForMultipleObjects(numThread, hThread, TRUE, INFINITE);
			if((unsigned)(i - WAIT_OBJECT_0) >= (unsigned)numThread) { DIE(); }	//Any other wait result is treated as a bug.
			for(i = 0; i < numThread; i++) { CloseHandle(hThread[i]); }
			//Clean up the shared structure. (All workers have ended, so nobody can signal the events any more.)
			CloseHandle(stMT.hEvent[0/*dstLen*/]);
			CloseHandle(stMT.hEvent[1/*iBlk*/]);	//To keep the code simple, [1] is created even when BYTEPAIRENCODINGENCODER_PRINTPROGRESS is not defined, but it is not used then.
		}
		divCnt = (1 << divLev);	//Division count adopted by the search above (1 when the search was skipped).
		//--- Output the compressed data ---
		{
			int blkPos, blkLen, tmpLen;
			dstLen = 0;
			for(iBlk = 0; iBlk < divCnt; iBlk++) {
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
				newMsgLen = fprintf(stderr, "\rout srcLen=%d dstLen=%d divCnt=%d (%d/%d)", srcLen, dstLen, divCnt, iBlk+1, divCnt);
				if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); } oldMsgLen = newMsgLen;
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
				//Compress this block.
				blkPos = (int)((int64_t)srcLen * (int64_t)(iBlk + 0) / (int64_t)divCnt);		//The casts to int64_t are required: computed in int,
				blkLen = (int)((int64_t)srcLen * (int64_t)(iBlk + 1) / (int64_t)divCnt) - blkPos;	//the product overflows for files of several megabytes.
				tmpLen = BytePairEncodingEncoder_EncodeSub(pEncoder, &src[blkPos], blkLen, &dst[dstLen], dstCap - dstLen);
				if(tmpLen < 0) { DIE(); }	//Cannot happen: the search judged this division 'compressible'.
				dstLen += tmpLen;		//Advance the output size.
			}
		}
		//Release the work area.
		free(pEncoder);
	}
	//Output (number of codes assigned to single bytes = 0). Per the original note, the decoder reads this and recognizes the terminating block.
	// - The 'dstLen++' here is not part of the search for the best division count, so it must be done whether or not an output buffer was specified.
	//   Otherwise the result with an output buffer and the result without one would differ by 1.
	if(dstLen < dstCap) { dst[dstLen] = 0; } dstLen++;
#ifdef  BYTEPAIRENCODINGENCODER_PRINTPROGRESS //{{ progress display
	newMsgLen = fprintf(stderr, "\rend srcLen=%d dstLen=%d", srcLen, dstLen);
	if(oldMsgLen > newMsgLen) { fprintf(stderr, "%*s", oldMsgLen - newMsgLen, ""); } putc('\n', stderr);
#endif//BYTEPAIRENCODINGENCODER_PRINTPROGRESS //}} progress display
	//Return the length of the compressed data.
	return dstLen;
}
//}}2015/11/14 change: reworked the multi-threaded implementation.
#endif//defined(_WIN32) && defined(_MT)
