/*	
 *	clipjpeg.c
 *
 *	JPEG Decoder
 *
 *	CLiP - Common Library for P/ECE
 *	Copyright (C) 2001-2009 Naoyuki Sawa
 *
 *	* Fri Feb 06 22:32:46 JST 2009 Naoyuki Sawa
 *	- Started development.
 *	* Mon Feb 09 23:11:25 JST 2009 Naoyuki Sawa
 *	- 1st release.
 *	* Thu Feb 12 18:14:19 JST 2009 Naoyuki Sawa
 *	- Rewrote the local functions jpeg_decoder_get_byte(), jpeg_decoder_get_half(), jpeg_decoder_ecs_get_bit() and jpeg_decoder_ecs_get_bits() in assembler.
 *	* Thu Feb 12 23:21:31 JST 2009 Naoyuki Sawa
 *	- Rewrote the local function jpeg_decoder_ecs_decode_huffman() in assembler.
 *	* Sun Feb 15 22:46:13 JST 2009 Naoyuki Sawa
 *	- Split the inner loop of the local function jpeg_decoder_ecs_decode() out into the local function jpeg_decoder_ecs_draw_block() and rewrote it in assembler.
 *	* Wed Feb 18 17:54:42 JST 2009 Naoyuki Sawa
 *	- Split the DC/AC decoding part of the local function jpeg_decoder_ecs_decode() out into the local function jpeg_decoder_ecs_decode_block() and rewrote it in assembler.
 */
#include "clip.h"

/*
 * When PIECE is defined, JPEG_ASM is defined as well. JPEG_ASM selects the
 * assembler (#else) variants of the low-level helpers below; without it the
 * portable C variants are compiled instead.
 */
#ifdef PIECE
#define JPEG_ASM
#endif /*PIECE*/

/****************************************************************************
 *	JPEG_DECODER
 ****************************************************************************/

/*
 * Decoder state shared by every routine in this file.
 *
 * The trailing "+ offset, size" comments record the byte layout. The
 * assembler variants address fields by position (for example
 * jpeg_decoder_ecs_get_bit reads data_ptr at +0 and ecs_bit_pos at +4), so
 * the order and types of the leading members must not be changed.
 */
typedef struct _JPEG_DECODER {
	/* Read cursor into the JPEG data; advanced by every get_* helper. */
	const unsigned char* data_ptr;					/* +  0, 4 */
	/* Index (0..7, MSB first) of the next bit to read from *data_ptr while
	 * decoding entropy-coded data. */
	short ecs_bit_pos;						/* +  4, 2 */
	/* ---- Values parsed from the SOF0 segment ---- */
	short sof_Y;							/* +  6, 2 */
	short sof_X;							/* +  8, 2 */
	short sof_Nf;							/* + 10, 2 */
	short sof_Ci[3];						/* + 12, 6 */
	/* High / low nibble of each component's Hi,Vi byte, plus the running
	 * maximum over all components. */
	short sof_Hi[3], sof_Hi_max;					/* + 18, 8 */
	short sof_Vi[3], sof_Vi_max;					/* + 26, 8 */
	short sof_Tq[3];						/* + 34, 6 */
	/* ---- Values parsed from the SOS segment ---- */
	short sos_Ns;							/* + 40, 2 */
	short sos_Csi[3];						/* + 42, 6 */
	short sos_Tdi[3];						/* + 48, 6 */
	short sos_Tai[3];						/* + 54, 6 */
	/* ---- Value parsed from the DRI segment (compared against the MCU
	 * counter in jpeg_decoder_ecs_decode) ---- */
	short dri_Ri;							/* + 60, 2 */
	/* padding */							/* + 62, 2 */
	/* Pointers into the input data: start of the 64 Qk bytes of each DQT
	 * table, indexed by the Pq/Tq byte. */
	const unsigned char* dqt_Qk[4];					/* + 64,16 */
	/* Pointers into the input data: start of the 16 Li counts of each DHT
	 * table, indexed [Tc][Th]; the values follow the counts. */
	const unsigned char* dht_Li[2][2];				/* + 80,16 */
	/* ---- Caller-supplied callbacks; each is only invoked when non-NULL ---- */
	void (*size_proc)(int w, int h, void* user_data);		/* + 96, 4 */
	void (*draw_proc)(int x, int y, int c, void* user_data);	/* +100, 4 */
	/* Opaque pointer handed back to both callbacks. */
	void* user_data;						/* +104, 4 */
} JPEG_DECODER;								/* =108 */

/*
 * Zigzag scan table for the 63 AC coefficients (the DC entry, position 0,
 * is omitted). Entry k gives the position inside the 8x8 block of the
 * (k+1)-th coefficient read from the stream.
 *
 * - C build:   jpeg_decoder_ecs_zigzag_index holds element indices (0..63)
 *              used directly as block[index].
 * - ASM build: jpeg_decoder_ecs_zigzag_offset holds the same indices
 *              multiplied by 2; the assembler adds the entry to the block
 *              address before a halfword store, i.e. it is a byte offset
 *              into an array of 2-byte elements.
 */
#ifndef JPEG_ASM
static const unsigned char jpeg_decoder_ecs_zigzag_index[64 - 1/*DC*/] = {
	    1, 8,16, 9, 2, 3,10,
	17,24,32,25,18,11, 4, 5,
	12,19,26,33,40,48,41,34,
	27,20,13, 6, 7,14,21,28,
	35,42,49,56,57,50,43,36,
	29,22,15,23,30,37,44,51,
	58,59,52,45,38,31,39,46,
	53,60,61,54,47,55,62,63,
};
#else /*JPEG_ASM*/
static const unsigned char jpeg_decoder_ecs_zigzag_offset[64 - 1/*DC*/] = {
	      1*2, 8*2,16*2, 9*2, 2*2, 3*2,10*2,
	17*2,24*2,32*2,25*2,18*2,11*2, 4*2, 5*2,
	12*2,19*2,26*2,33*2,40*2,48*2,41*2,34*2,
	27*2,20*2,13*2, 6*2, 7*2,14*2,21*2,28*2,
	35*2,42*2,49*2,56*2,57*2,50*2,43*2,36*2,
	29*2,22*2,15*2,23*2,30*2,37*2,44*2,51*2,
	58*2,59*2,52*2,45*2,38*2,31*2,39*2,46*2,
	53*2,60*2,61*2,54*2,47*2,55*2,62*2,63*2,
};
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Fetch the next byte of the input and step the read cursor past it.
 *
 * self   : decoder state; self->data_ptr is advanced by one.
 * return : the byte value, 0..255.
 */
static int jpeg_decoder_get_byte(JPEG_DECODER* self) {
	const unsigned char* cursor = self->data_ptr;
	int value = cursor[0];
	self->data_ptr = cursor + 1;
	return value;
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_get_byte(), compiled when JPEG_ASM is
 * defined.
 *
 * As annotated inline: loads self->data_ptr, reads one unsigned byte with
 * post-increment, stores the advanced pointer back to self->data_ptr and
 * leaves the byte in %r10.
 *
 * NOTE(review): the prototype has its `static` commented out, presumably
 * because the symbol is defined by the top-level asm statement rather than
 * by a C definition -- confirm against the toolchain.
 */
/*static*/ int jpeg_decoder_get_byte(JPEG_DECODER* self);
asm("
		.code
		.align	1
jpeg_decoder_get_byte:
		ld.w		%r13, [%r12]				;// %r13 := data_ptr = self->data_ptr
		ld.ub		%r10, [%r13]+				;// %r10 := c = *data_ptr++
		ld.w		[%r12], %r13				;// self->data_ptr = data_ptr
		ret
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Read a big-endian 16-bit value (high byte first) from the input.
 *
 * self   : decoder state; self->data_ptr is advanced by two.
 * return : the value, 0..65535.
 */
static int jpeg_decoder_get_half(JPEG_DECODER* self) {
	/* Two separate statements keep the high byte strictly first. */
	int value = jpeg_decoder_get_byte(self) << 8;
	value |= jpeg_decoder_get_byte(self);
	return value;
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_get_half(), compiled when JPEG_ASM is
 * defined.
 *
 * As annotated inline: reads two consecutive unsigned bytes through
 * self->data_ptr (post-increment), stores the advanced pointer back, and
 * returns (hi << 8) | lo in %r10. The final `or` sits in the delay slot of
 * `ret.d`.
 *
 * NOTE(review): the inline comment on the second load names %r10, but the
 * instruction loads the low byte into %r11.
 */
/*static*/ int jpeg_decoder_get_half(JPEG_DECODER* self);
asm("
		.code
		.align	1
jpeg_decoder_get_half:
		ld.w		%r13, [%r12]				;// %r13 := data_ptr = self->data_ptr
		ld.ub		%r10, [%r13]+				;// %r10 := hi = *data_ptr++
		ld.ub		%r11, [%r13]+				;// %r10 := lo = *data_ptr++
		ld.w		[%r12], %r13				;// self->data_ptr = data_ptr
		sla		%r10, 8					;// %r10 := hi << 8
		ret.d
		or		%r10, %r11				;// %r10 := hi << 8 | lo					*delay*
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Read one bit of entropy-coded data, most significant bit of each byte
 * first.
 *
 * self->ecs_bit_pos selects the bit inside *self->data_ptr. Once the last
 * bit of a byte has been taken the cursor moves to the next byte; if the
 * finished byte was 0xFF the following stuffing byte is skipped as well and
 * must be zero, otherwise DIE() is invoked.
 *
 * return : 0 or 1.
 */
static int jpeg_decoder_ecs_get_bit(JPEG_DECODER* self) {
	const int pos = self->ecs_bit_pos;
	const int bit = (*self->data_ptr >> (pos ^ 7)) & 1;
	const int next = (pos + 1) & 7;

	self->ecs_bit_pos = next;
	if(next != 0) {
		return bit;		/* still inside the current byte */
	}
	if(*self->data_ptr++ != 0xFF) {
		return bit;		/* ordinary byte consumed */
	}
	if(*self->data_ptr++ != 0) {	/* Byte stuffing */
		DIE();
	}
	return bit;
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_ecs_get_bit(), compiled when JPEG_ASM
 * is defined.
 *
 * As annotated inline: loads self->data_ptr and self->ecs_bit_pos, reads the
 * current byte (signed load, post-increment), rotates/mirrors it so the
 * selected bit ends up in bit 0 of %r10, and stores
 * (ecs_bit_pos + 1) & 7 back. Only when the new position is zero is
 * self->data_ptr written back; in that case the pointer gets one extra
 * increment when the byte was 0xFF (carry produced by `add %r15, 1`, added
 * with `adc`).
 *
 * NOTE(review): `adc %r13, %r8` assumes %r8 holds zero -- confirm against
 * the platform's register conventions.
 * NOTE(review): unlike the C variant, no DIE() call is visible here when the
 * byte following 0xFF is non-zero.
 */
/*static*/ int jpeg_decoder_ecs_get_bit(JPEG_DECODER* self);
asm("
		.code
		.align	1
jpeg_decoder_ecs_get_bit:
		ld.w		%r13, [%r12]+				;// %r13 := data_ptr    = self->data_ptr
		ld.h		%r14, [%r12]				;// %r14 := ecs_bit_pos = self->ecs_bit_pos
		ld.b		%r15, [%r13]+				;// %r15 := c = (char)*data_ptr++
		rl		%r15, %r14				;// %r15 := c = ???????? ???????? ???????? b??????? (b = bit)
		add		%r14, 1					;// %r14 :=                  ecs_bit_pos + 1
		and		%r14, 7					;// %r14 :=                 (ecs_bit_pos + 1) & 7
		ld.h		[%r12], %r14				;//     self->ecs_bit_pos = (ecs_bit_pos + 1) & 7
		jrne.d		jpeg_decoder_ecs_get_bit_RET		;// if(!self->ecs_bit_pos) {
		mirror		%r10, %r15				;// %r10 :=     ???????? ???????? ???????? ???????b (b = bit)	*delay*
		add		%r15, 1					;//   %psr(C) := (c == -1) ? 1 : 0
		adc		%r13, %r8				;//   %r13 := data_ptr + %psr(C)
		sub		%r12, 4					;//   %r12 := self
		ld.w		[%r12], %r13				;//   self->data_ptr = data_ptr
jpeg_decoder_ecs_get_bit_RET:						;// }
		ret.d
		and		%r10, 1					;// %r10 :=     00000000 00000000 00000000 0000000b (b = bit)	*delay*
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Read a `bits`-wide magnitude field from the entropy-coded data and
 * sign-extend it the JPEG way (the EXTEND procedure):
 *
 *   - leading bit 1 : the value is the raw field, i.e. positive;
 *   - leading bit 0 : the value is raw - (2^bits - 1), i.e. negative.
 *
 * self   : decoder state (bits are consumed via jpeg_decoder_ecs_get_bit).
 * bits   : field width (the SSSS category); 0 yields 0 without reading.
 * return : the signed coefficient / DC difference.
 *
 * The original implementation built the negative case by left-shifting a
 * negative int, which is undefined behaviour in C. The bits are now
 * accumulated in an unsigned value and the offset is applied afterwards;
 * results for valid widths are unchanged. Widths outside 1..16 cannot occur
 * in a valid stream and are reported through DIE() instead of shifting out
 * of range (DIE() is assumed not to return; the `return 0` is a fallback).
 */
static int jpeg_decoder_ecs_get_bits(JPEG_DECODER* self, int bits) {
	unsigned int raw = 0;
	int remaining;

	if(!bits) {
		return 0;
	}
	if((bits < 0) || (bits > 16)) {
		DIE();
		return 0;
	}
	remaining = bits;
	do {
		raw = raw << 1 | (unsigned int)jpeg_decoder_ecs_get_bit(self);
	} while(--remaining);
	if(raw >> (bits - 1)) {
		return (int)raw;			/* leading 1: positive, as read */
	}
	return (int)((long)raw - ((1L << bits) - 1));	/* leading 0: negative */
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_ecs_get_bits(), compiled when JPEG_ASM
 * is defined.
 *
 * As annotated inline: returns the incoming `bits` register unchanged (0)
 * when bits is zero. Otherwise the first bit is mapped 1 -> 1 / 0 -> -2, the
 * remaining bits are shifted in one at a time through
 * jpeg_decoder_ecs_get_bit(), and finally 1 is added when the accumulated
 * value is negative (the sign bit is moved into carry by `add %r2, %r2` and
 * added with `adc`). %r0..%r2 are saved and restored with pushn/popn.
 *
 * NOTE(review): `adc %r13, %r8` assumes %r8 holds zero -- confirm against
 * the platform's register conventions.
 */
/*static*/ int jpeg_decoder_ecs_get_bits(JPEG_DECODER* self, int bits);
asm("
		.code
		.align	1
jpeg_decoder_ecs_get_bits:
		cmp		%r13, 0					;// if(bits) {
		jreq		jpeg_decoder_ecs_get_bits_RET		;//   
		pushn		%r2					;//   
		ld.w		%r0, %r12				;//   %r0  := self
		call.d		jpeg_decoder_ecs_get_bit		;//   %r10 := v = jpeg_decoder_ecs_get_bit(self)
		ld.w		%r1, %r13				;//   %r1  := bits						*delay*
		sub		%r10, 1					;//   %r10 := v = (1 -> 0     ) or (0 -> -1      )
		xor		%r10, 1					;//   %r10 := v = (1 -> 0 -> 1) or (0 -> -1 -> -2)
		sub		%r1, 1					;//   %r1  := bits--
		jreq.d		jpeg_decoder_ecs_get_bits_L2		;//   if(bits) {
		ld.w		%r2, %r10				;//   %r2  := v							*delay*
jpeg_decoder_ecs_get_bits_L1:						;//     do {
		call.d		jpeg_decoder_ecs_get_bit		;//       %r10 := tmp = jpeg_decoder_ecs_get_bit(self)
		ld.w		%r12, %r0				;//       %r12 := self						*delay*
		sla		%r2, 1					;//       %r2  := v <<= 1
		sub		%r1, 1					;//       %r1  := bits--
		jrne.d		jpeg_decoder_ecs_get_bits_L1		;//     } while(bits)
		or		%r2, %r10				;//       %r2  := v |= tmp					*delay*
jpeg_decoder_ecs_get_bits_L2:						;//   }
		ld.w		%r13, %r2				;//   %r13 := v
		add		%r2, %r2				;//   %psr(C) := (v < 0) ? 1 : 0
		adc		%r13, %r8				;//   %r13 := v += %psr(C)
		popn		%r2					;//   
jpeg_decoder_ecs_get_bits_RET:						;// }
		ret.d							;// 
		ld.w		%r10, %r13				;// %r10 := v							*delay*
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Decode one Huffman symbol from the entropy-coded data.
 *
 * Li     : start of a DHT table body as stored in the file: 16 count bytes
 *          (number of codes of length 1..16) immediately followed by the
 *          symbol values in code order.
 * return : the decoded symbol value (0..255).
 *
 * The code is grown one bit at a time. For each length the valid codes form
 * the range [code_min, code_min + count); the first length whose range
 * contains the code identifies the symbol.
 *
 * The original loop was unbounded: a corrupt stream whose code matched none
 * of the 16 lengths kept reading "counts" out of the value area and beyond.
 * The search now stops after 16 bits and reports the error through DIE()
 * (assumed not to return; the `return 0` is only a fallback).
 */
static int jpeg_decoder_ecs_decode_huffman(JPEG_DECODER* self, const unsigned char* Li) {
	const unsigned char* Vij = Li + 16;
	int code = 0;
	int code_min = 0;
	int code_max;
	int code_num;
	int length;
	for(length = 1; length <= 16; length++) {
		code = code << 1 | jpeg_decoder_ecs_get_bit(self);
		code_num = *Li++;
		code_max = code_min + code_num;
		/* (code >= code_min) always holds at this point, so only the
		 * upper bound needs to be tested. */
		if(code < code_max) {
			return Vij[code - code_min];
		}
		Vij += code_num;
		code_min = code_max << 1;
	}
	DIE();	/* no code of length 1..16 matched: corrupt stream or table */
	return 0;
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_ecs_decode_huffman(), compiled when
 * JPEG_ASM is defined.
 *
 * As annotated inline: keeps `self` in a stack slot, Li in %r0, Vij
 * (= Li + 16) in %r1, the growing code in %r2 and code_min/code_max in %r3.
 * Each iteration shifts one more bit from jpeg_decoder_ecs_get_bit() into
 * the code, adds the next count to form code_max, and leaves the loop when
 * code < code_max. Vij has already been advanced by the count in the branch
 * delay slot at that point, so the result is loaded from
 * Vij + code - code_max.
 *
 * NOTE(review): the loop has no upper bound on the code length.
 */
/*static*/ int jpeg_decoder_ecs_decode_huffman(JPEG_DECODER* self, const unsigned char* Li);
asm("
		.code
		.align	1
jpeg_decoder_ecs_decode_huffman:
		pushn		%r3
		xsub		%sp, %sp, 4
		xld.w		[%sp], %r12				;// [%sp] := self
		ld.w		%r0, %r13				;// %r0   := Li
		xadd		%r1, %r13, 16				;// %r1   := Vij
		ld.w		%r2, 0					;// %r2   := code
		ld.w		%r3, 0					;// %r3   := code_min
jpeg_decoder_ecs_decode_huffman_L1:					;// for(;;) {
		xld.w		%r12, [%sp]				;//   %r12  := self
		call.d		jpeg_decoder_ecs_get_bit		;//   %r10  :=                    jpeg_decoder_ecs_get_bit(self)
		sla		%r2, 1					;//   %r2   :=        code << 1					*delay
		or		%r2, %r10				;//   %r12  := code = code << 1 | jpeg_decoder_ecs_get_bit(self)
		ld.ub		%r4, [%r0]+				;//   %r4   := code_num = *Li++
		add		%r3, %r4				;//   %r3   := code_max = code_min + code_num
		cmp		%r2, %r3				;//   if(code < code_max)
		jrlt.d		jpeg_decoder_ecs_decode_huffman_RET	;//     break
		add		%r1, %r4				;//   %r1   := Vij += code_num					*delay*
		jp.d		jpeg_decoder_ecs_decode_huffman_L1	;//   
		sla		%r3, 1					;//   %r3   := code_min = code_max << 1				*delay*
jpeg_decoder_ecs_decode_huffman_RET:					;// }
		add		%r1, %r2				;// %r1  :=                         &Vij[code + code_num           ]
		sub		%r1, %r3				;// %r1  := &Vij[code - code_min] = &Vij[code + code_num - code_max]
		ld.ub		%r10, [%r1]				;// %r10 :=  Vij[code - code_min]
		xadd		%sp, %sp, 4
		popn		%r3
		ret
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Decode and dequantise one 8x8 block of coefficients.
 *
 * self  : decoder state (source of the entropy-coded bits).
 * Li_dc : DHT table used for the DC coefficient.
 * dc    : in/out DC predictor of the component; the decoded difference is
 *         added to it.
 * Qk    : 64 quantisation values in zigzag order.
 * Li_ac : DHT table used for the AC coefficients.
 * block : output, 64 shorts. Only non-skipped coefficients are written, so
 *         the caller must clear the block beforehand.
 *
 * Each AC symbol is RRRRSSSS: RRRR zero coefficients are skipped, then one
 * coefficient of SSSS bits is stored. 0x00 is end-of-block; 0xF0 (RRRR=15,
 * SSSS=0) therefore stores an explicit zero after 15 skipped positions.
 *
 * The original code did not verify that a run stays inside the block. A
 * corrupt run made the remaining-coefficient counter non-positive, after
 * which the zigzag table and `block` were indexed out of bounds and the
 * `while(--i)` loop did not terminate. Such a run is now reported through
 * DIE() (assumed not to return; the `return` is only a fallback).
 */
static void jpeg_decoder_ecs_decode_block(
	JPEG_DECODER* self,
	const unsigned char* Li_dc,
	short* dc,
	const unsigned char* Qk,
	const unsigned char* Li_ac,
	short block[/*64*/]) {

	int i;
	int ac;
	int rrrr;
	int ssss;
	int rrrrssss;
	const unsigned char* z;

	/* DC */
	ssss = jpeg_decoder_ecs_decode_huffman(self, Li_dc);
	*dc += jpeg_decoder_ecs_get_bits(self, ssss);
	block[0] = *dc * *Qk++;

	/* AC */
	i = (64 - 1/*DC*/);	/* coefficients still available, including the next one */
	z = jpeg_decoder_ecs_zigzag_index;
	do {
		rrrrssss = jpeg_decoder_ecs_decode_huffman(self, Li_ac);
		if(!rrrrssss) { /* EOB */
			break;
		}
		rrrr = rrrrssss >> 4;
		ssss = rrrrssss & 15;
		i  -= rrrr;
		if(i <= 0) { /* run would skip past the last coefficient */
			DIE();
			return;
		}
		z  += rrrr;
		Qk += rrrr;
		ac = jpeg_decoder_ecs_get_bits(self, ssss);
		block[*z++] = ac * *Qk++;
	} while(--i);
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_ecs_decode_block(), compiled when
 * JPEG_ASM is defined.
 *
 * As annotated inline: the first four arguments arrive in %r12..%r15 and
 * Li_ac / block are fetched from the stack (at +20 / +24 after
 * `pushn %r3`). The DC part decodes SSSS, reads the difference, updates *dc
 * and stores (*dc) * Qk[0] into block[0]. The AC loop then runs at most 63
 * times: decode RRRRSSSS, stop on zero (EOB), advance the counter, the
 * zigzag pointer and Qk by RRRR, read the coefficient and store
 * ac * *Qk++ at the byte offset taken from
 * jpeg_decoder_ecs_zigzag_offset. Products are formed with `mlt.h` and
 * read back from %alr.
 *
 * NOTE(review): as written there is no check that a run of RRRR zeros stays
 * within the 63 AC positions.
 */
/*static*/ void jpeg_decoder_ecs_decode_block(
	JPEG_DECODER* self,
	const unsigned char* Li_dc,
	short* dc,
	const unsigned char* Qk,
	const unsigned char* Li_ac,
	short block[/*64*/]);
asm("
		.code
		.align	1
jpeg_decoder_ecs_decode_block:
		;// %r12    := self
		;// %r13    := Li_dc
		;// %r14    := dc
		;// %r15    := Qk
		;// [%sp+0] := retp
		;// [%sp+4] := Li_ac
		;// [%sp+8] := block
		pushn		%r3
		ld.w		%r0, %r14				;// %r0  := dc
		ld.h		%r1, [%r0]				;// %r1  := *dc
		ld.w		%r2, %r15				;// %r2  := Qk
		;//-----------------------------------------------------;
		;// %r0      := dc
		;// %r1      := *dc
		;// %r2      := Qk
		;// %r12     := self
		;// %r13     := Li_dc
		;// [%sp+0]  := %r1
		;// [%sp+4]  := %r2
		;// [%sp+8]  := %r2
		;// [%sp+12] := %r2
		;// [%sp+16] := retp
		;// [%sp+20] := Li_ac
		;// [%sp+24] := block
		call.d		jpeg_decoder_ecs_decode_huffman		;// %r10 := ssss = jpeg_decoder_ecs_decode_huffman(self, Li_dc)
		ld.w		%r3, %r12				;// %r3  :=                                        self			*delay*
		ld.w		%r13, %r10				;// %r13 :=                                        ssss
		call.d		jpeg_decoder_ecs_get_bits		;// %r10 := diff = jpeg_decoder_ecs_get_bits(self, ssss)
		ld.w		%r12, %r3				;// %r12 :=                                  self			*delay*
		add		%r10, %r1				;// %r10 := *dc + diff
		ld.h		[%r0], %r10				;// *dc  := *dc + diff
		;// %r2  := Qk
		;// %r3  := self
		;// %r10 := *dc + diff
		xld.w		%r4, [%sp+24]				;// %r4  := block
		ld.ub		%r5, [%r2]+				;// %r5  :=                            *Qk++
		mlt.h		%r5, %r10				;// %alr :=             (*dc + diff) * *Qk++
		ld.w		%r5, %alr				;// %r5  :=             (*dc + diff) * *Qk++
		ld.h		[%r4], %r5				;//         block[0] := (*dc + diff) * *Qk++
		;//-----------------------------------------------------;
		xld.w		%r0, 63					;// %r0  := i = (64 - 1/*DC*/)
		xld.w		%r1, jpeg_decoder_ecs_zigzag_offset	;// %r1  := z = jpeg_decoder_ecs_zigzag_offset
jpeg_decoder_ecs_decode_block_L1:					;// do {
		;// %r0  := i
		;// %r1  := z
		;// %r2  := Qk
		;// %r3  := self
		xld.w		%r13, [%sp+20]				;//   %r13 :=                                                  Li_ac
		call.d		jpeg_decoder_ecs_decode_huffman		;//   %r10 := rrrrssss = jpeg_decoder_ecs_decode_huffman(self, Li_ac)
		ld.w		%r12, %r3				;//   %r12 :=                                            self		*delay*
		cmp		%r10, 0					;//   if(!rrrrssss)
		jreq		jpeg_decoder_ecs_decode_block_L2	;//     break
		xand		%r13, %r10, 15				;//   %r13 := ssss = rrrrssss & 15
		sra		%r10, 4					;//   %r10 := rrrr = rrrrssss >> 4
		sub		%r0, %r10				;//   %r0  := i  -= rrrr
		add		%r1, %r10				;//   %r1  := z  += rrrr
		add		%r2, %r10				;//   %r2  := Qk += rrrr
		call.d		jpeg_decoder_ecs_get_bits		;//   %r10 := ac = jpeg_decoder_ecs_get_bits(self, ssss)
		ld.w		%r12, %r3				;//   %r12 :=                                self			*delay*
		;// %r0  := i
		;// %r1  := z
		;// %r2  := Qk
		;// %r3  := self
		;// %r10 := ac
		xld.w		%r4, [%sp+24]				;//   %r4  :=  block
		ld.ub		%r5, [%r1]+				;//   %r5  :=        *z++
		add		%r4, %r5				;//   %r4  := &block[*z++]
		ld.ub		%r5, [%r2]+				;//   %r5  :=                     *Qk++
		mlt.h		%r5, %r10				;//   %alr :=                ac * *Qk++
		ld.w		%r5, %alr				;//   %r5  :=                ac * *Qk++
		ld.h		[%r4], %r5				;//            block[*z++] = ac * *Qk++
		;// %r0  := i
		;// %r1  := z
		;// %r2  := Qk
		;// %r3  := self
		sub		%r0, 1					;//   %r0  := i--
		jrne		jpeg_decoder_ecs_decode_block_L1	;// } while(i)
jpeg_decoder_ecs_decode_block_L2:
		;//-----------------------------------------------------;
		popn		%r3
		ret
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

#ifndef JPEG_ASM
/*
 * Emit one decoded 8x8 block through the draw callback.
 *
 * draw_proc : called as draw_proc(x, y, c, user_data) for every visible pixel.
 * user_data : passed through to draw_proc unchanged.
 * sof_X     : image width;  pixels with x >= sof_X are skipped.
 * sof_Y     : image height; pixels with y >= sof_Y are skipped.
 * org_x     : x coordinate of the block's top-left pixel.
 * org_y     : y coordinate of the block's top-left pixel.
 * mag_x     : horizontal repeat count per sample (expected >= 1).
 * mag_y     : vertical repeat count per sample (expected >= 1).
 * block     : 64 samples in row-major order.
 *
 * Each sample is level-shifted by +128 and clamped to 0..255, then painted
 * as a mag_x by mag_y rectangle, row by row.
 */
static void jpeg_decoder_ecs_draw_block(
	void (*draw_proc)(int x, int y, int c, void* user_data),
	void* user_data,
	int sof_X,
	int sof_Y,
	int org_x,
	int org_y,
	int mag_x,
	int mag_y,
	const short block[/*64*/]) {

	const short* sample = block;
	int row;
	int col;

	for(row = 0; row < 8; row++) {
		const int top = org_y + row * mag_y;
		for(col = 0; col < 8; col++) {
			const int left = org_x + col * mag_x;
			int level = *sample++ + 128;
			int rows_left = mag_y;
			int py = top;

			if(level < 0) {
				level = 0;
			} else if(level > 255) {
				level = 255;
			}
			do {
				int cols_left = mag_x;
				int px = left;
				do {
					if((py < sof_Y) && (px < sof_X)) {
						draw_proc(px, py, level, user_data);
					}
					px++;
				} while(--cols_left);
				py++;
			} while(--rows_left);
		}
	}
}
#else /*JPEG_ASM*/
/*
 * Assembler variant of jpeg_decoder_ecs_draw_block(), compiled when
 * JPEG_ASM is defined.
 *
 * As annotated inline: draw_proc, user_data, sof_X and sof_Y arrive in
 * %r12..%r15 and are spilled to a 24-byte local frame; org_x, org_y, mag_x,
 * mag_y and block are read from the caller's stack (at +44..+60 after the
 * prologue). Four nested loops walk 8 source rows x 8 source columns x mag_y
 * x mag_x. Each sample is clamped with the "- 127, clip to <= 0, + 255,
 * clip to >= 0" sequence, which yields sample + 128 limited to 0..255.
 * draw_proc is called only when x < sof_X and y < sof_Y; x, y and the block
 * pointer are saved to the stack around the call and reloaded afterwards.
 */
/*static*/ void jpeg_decoder_ecs_draw_block(
	void (*draw_proc)(int x, int y, int c, void* user_data),
	void* user_data,
	int sof_X,
	int sof_Y,
	int org_x,
	int org_y,
	int mag_x,
	int mag_y,
	const short block[/*64*/]);
asm("
		.code
		.align	1
jpeg_decoder_ecs_draw_block:
		;// %r12     := draw_proc
		;// %r13     := user_data
		;// %r14     := sof_X
		;// %r15     := sof_Y
		;// [%sp+0]  := retp
		;// [%sp+4]  := org_x
		;// [%sp+8]  := org_y
		;// [%sp+12] := mag_x
		;// [%sp+16] := mag_y
		;// [%sp+20] := block
		pushn		%r3
		xsub		%sp, %sp, 24
		xld.w		[%sp+8], %r12				;// [%sp+8]  := draw_proc
		xld.w		[%sp+12], %r13				;// [%sp+12] := user_data
		xld.w		[%sp+16], %r14				;// [%sp+16] := sof_X
		xld.w		[%sp+20], %r15				;// [%sp+20] := sof_Y
		;//-----------------------------------------------------;
		;// [%sp+8]  := draw_proc
		;// [%sp+12] := user_data
		;// [%sp+16] := sof_X
		;// [%sp+20] := sof_Y
		;// [%sp+24] := %r2
		;// [%sp+28] := %r3
		;// [%sp+32] := %r0
		;// [%sp+36] := %r1
		;// [%sp+40] := retp
		;// [%sp+44] := org_x
		;// [%sp+48] := org_y
		;// [%sp+52] := mag_x
		;// [%sp+56] := mag_y
		;// [%sp+60] := block
		xld.w		%r15, [%sp+60]				;// %r15 := block
		xld.w		%r13, [%sp+48]				;// %r13 := y = org_y
		ld.w		%r1, 8					;// %r1  := src_y = 8
jpeg_decoder_ecs_draw_block_SRC_Y:					;// do {
		;//-----------------------------------------------------;
		;// %r1  := src_y
		;// %r13 := y
		;// %r15 := block
		xld.w		%r12, [%sp+44]				;//   %r12 := x = org_x
		ld.w		%r0, 8					;//   %r0  := src_x = 8
jpeg_decoder_ecs_draw_block_SRC_X:					;//   do {
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r12 := x
		;// %r13 := y
		;// %r15 := block
		ld.h		%r14, [%r15]+				;//     %r14 := c = [127 .. -128]
		xsub		%r14, %r14, 127				;//     %r14 := c = [  0 .. -255]
		jrle		2					;//     if(c > 0)
		 ld.w		%r14, 0					;//       %r14 := c = 0
		xadd		%r14, %r14, 255				;//     %r14 := c = [255 ..    0]
		jrge		2					;//     if(c < 0)
		 ld.w		%r14, 0					;//       %r14 := c = 0
		xld.w		%r3, [%sp+56]				;//     %r3  := dst_y = mag_y
jpeg_decoder_ecs_draw_block_DST_Y:					;//     do {
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r3  := dst_y
		;// %r12 := x
		;// %r13 := y
		;// %r14 := c
		;// %r15 := block
		xld.w		%r2, [%sp+52]				;//       %r2  := dst_x = mag_x
jpeg_decoder_ecs_draw_block_DST_X:					;//       do {
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r2  := dst_x
		;// %r3  := dst_y
		;// %r12 := x
		;// %r13 := y
		;// %r14 := c
		;// %r15 := block
		xld.w		%r10, [%sp+16]				;//         %r10 := sof_X
		xld.w		%r11, [%sp+20]				;//         %r11 := sof_Y
		cmp		%r12, %r10				;//         if((x < sof_X) &&
		jrge		jpeg_decoder_ecs_draw_block_SKIP
		cmp		%r13, %r11				;//            (y < sof_Y)) {
		jrge		jpeg_decoder_ecs_draw_block_SKIP
		xld.w		[%sp+0], %r12				;//           [%sp+0]  := x
		xld.w		[%sp+4], %r13				;//           [%sp+4]  := y
		xld.w		[%sp+60], %r15				;//           [%sp+60] := block
		xld.w		%r9, [%sp+8]				;//           %r9  := draw_proc
		xld.w		%r15, [%sp+12]				;//           %r15 := user_data
		call		%r9					;//           draw_proc(x, y, c, user_data)
		xld.w		%r12, [%sp+0]				;//           %r12 := x
		xld.w		%r13, [%sp+4]				;//           %r13 := y
		xld.w		%r15, [%sp+60]				;//           %r15 := block
jpeg_decoder_ecs_draw_block_SKIP:					;//         }
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r2  := dst_x
		;// %r3  := dst_y
		;// %r12 := x
		;// %r13 := y
		;// %r15 := block
		sub		%r2, 1					;//         %r2  := dst_x--
		jrne.d		jpeg_decoder_ecs_draw_block_DST_X	;//       } while(dst_x)
		add		%r12, 1					;//         %r12 := x++				*delay*
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r2  := dst_x
		;// %r3  := dst_y
		;// %r12 := x
		;// %r13 := y
		;// %r15 := block
		xld.w		%r10, [%sp+52]				;//       %r10 := mag_x
		add		%r13, 1					;//       %r13 := y++
		sub		%r3, 1					;//       %r3  := dst_y--
		jrne.d		jpeg_decoder_ecs_draw_block_DST_Y	;//     } while(dst_y)
		sub		%r12, %r10				;//       %r12 := x -= mag_x			*delay*
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r2  := dst_x
		;// %r3  := dst_y
		;// %r10 := mag_x
		;// %r12 := x
		;// %r13 := y
		;// %r15 := block
		xld.w		%r11, [%sp+56]				;//     %r11 := mag_y
		add		%r12, %r10				;//     %r12 := x += mag_x
		sub		%r0, 1					;//     %r0  := src_x--
		jrne.d		jpeg_decoder_ecs_draw_block_SRC_X	;//   } while(src_x)
		sub		%r13, %r11				;//     %r13  := y -= mag_y			*delay*
		;//-----------------------------------------------------;
		;// %r0  := src_x
		;// %r1  := src_y
		;// %r2  := dst_x
		;// %r3  := dst_y
		;// %r11 := mag_y
		;// %r12 := x
		;// %r13 := y
		;// %r15 := block
		sub		%r1, 1					;//   %r1  := src_y--
		jrne.d		jpeg_decoder_ecs_draw_block_SRC_Y	;// } while(src_y)
		add		%r13, %r11				;//   %r13 := y += mag_y			*delay*
		;//-----------------------------------------------------;
		xadd		%sp, %sp, 24
		popn		%r3
		ret
");
#endif /*JPEG_ASM*/

/*--------------------------------------------------------------------------*/

/*
 * Decode the entropy-coded segment that follows an SOS header.
 *
 * The image is walked MCU by MCU (left to right, top to bottom). Inside an
 * MCU every scan component contributes sof_Hi x sof_Vi blocks. All blocks
 * are entropy-decoded to keep the bit stream and DC predictors in step, but
 * only the blocks of component COMPONENT are inverse-transformed and handed
 * to self->draw_proc, each sample being magnified by mag_x x mag_y so the
 * component covers the whole MCU. When a restart interval is set, the RSTm
 * marker is checked and the DC predictors are reset after every Ri MCUs. On
 * return the read cursor is byte-aligned behind the segment.
 *
 * Changes against the original:
 *  - Sampling factors of zero (corrupt SOF0, or SOS seen before SOF0) led to
 *    a division by zero or an endless block loop; they now raise DIE().
 *  - Table selectors are range-checked before indexing dht_Li[2][2] /
 *    dqt_Qk[4], and a selected table that was never defined (NULL pointer)
 *    raises DIE() instead of being dereferenced.
 *  - dri_Ri is stored in a short, so Ri >= 32768 became negative and never
 *    matched the MCU counter; it is compared as an unsigned 16-bit value.
 *    Ri == 0 (no restarts) no longer increments the counter at all.
 * DIE() is assumed not to return; the `return` statements after it are only
 * a fallback.
 */
static void jpeg_decoder_ecs_decode(JPEG_DECODER* self) {
#define COMPONENT 0	/* JFIF: 0 = Y, 1 = Cb, 2 = Cr */
	const int restart_interval = (unsigned short)self->dri_Ri;
	int mag_x;
	int mag_y;
	int mcu_cols;
	int mcu_rows;
	int sof_i;
	int sos_i;
	int mcu_i = 0;
	int mcu_col;
	int mcu_row;
	int block_col;
	int block_row;
	int RSTm = 0xFFD0/*RST0*/;
	short dc[3] = { 0, 0, 0 };
	short block[64];
	const unsigned char* Li_dc;
	const unsigned char* Li_ac;
	const unsigned char* Qk;

	/* Every divisor below must be positive. */
	if((self->sof_Hi[COMPONENT/*sof_i*/] <= 0) || (self->sof_Vi[COMPONENT/*sof_i*/] <= 0) ||
	   (self->sof_Hi_max <= 0) || (self->sof_Vi_max <= 0)) {
		DIE();
		return;
	}
	mag_x = self->sof_Hi_max / self->sof_Hi[COMPONENT/*sof_i*/];
	mag_y = self->sof_Vi_max / self->sof_Vi[COMPONENT/*sof_i*/];
	/* Number of 8x8 blocks per axis, rounded up to whole MCUs. */
	mcu_cols = (((self->sof_X + 7) >> 3) + (self->sof_Hi_max - 1)) / self->sof_Hi_max;
	mcu_rows = (((self->sof_Y + 7) >> 3) + (self->sof_Vi_max - 1)) / self->sof_Vi_max;

	mcu_col = 0;
	mcu_row = 0;
	for(;;) {
		sof_i = 0;
		sos_i = 0;
		do {
			/* Find the frame component this scan component refers to.
			 * sof_i is not reset, so scan order must follow frame order. */
			while(self->sof_Ci[sof_i] != self->sos_Csi[sos_i]) {
				if(++sof_i == self->sof_Nf) {
					DIE();
					return;
				}
			}
			if((self->sof_Hi[sof_i] <= 0) || (self->sof_Vi[sof_i] <= 0) ||
			   ((unsigned)self->sos_Tdi[sos_i] > 1) ||
			   ((unsigned)self->sos_Tai[sos_i] > 1) ||
			   ((unsigned)self->sof_Tq[sof_i] > 3)) {
				DIE();
				return;
			}
			Li_dc = self->dht_Li[0][self->sos_Tdi[sos_i]];
			Li_ac = self->dht_Li[1][self->sos_Tai[sos_i]];
			Qk    = self->dqt_Qk[self->sof_Tq[sof_i]];
			if(!Li_dc || !Li_ac || !Qk) { /* table selected but never defined */
				DIE();
				return;
			}
			block_col = 0;
			block_row = 0;
			for(;;) {
				/* decode_block only writes non-zero positions. */
				memset(block, 0, sizeof block);
				jpeg_decoder_ecs_decode_block(
					self,
					Li_dc,
					&dc[sof_i],
					Qk,
					Li_ac,
					block);
				if((sof_i == COMPONENT) && self->draw_proc) {
					idct8x8half(block, block);
					jpeg_decoder_ecs_draw_block(
						self->draw_proc,
						self->user_data,
						self->sof_X,
						self->sof_Y,
						(mcu_col * self->sof_Hi_max + block_col * mag_x) << 3,
						(mcu_row * self->sof_Vi_max + block_row * mag_y) << 3,
						mag_x,
						mag_y,
						block);
				}
				if(++block_col == self->sof_Hi[sof_i]) {
					block_col = 0;
					if(++block_row == self->sof_Vi[sof_i]) {
						break;
					}
				}
			}
		} while(++sos_i < self->sos_Ns);
		if(++mcu_col == mcu_cols) {
			mcu_col = 0;
			if(++mcu_row == mcu_rows) {
				break;
			}
		}
		if(restart_interval && (++mcu_i == restart_interval)) {
			mcu_i = 0;
			while(self->ecs_bit_pos) { /* Byte alignment */
				jpeg_decoder_ecs_get_bit(self);
			}
			if(jpeg_decoder_get_half(self) != RSTm) { /* RSTm */
				DIE();
				return;
			}
			RSTm = (RSTm + 1) & ~8;	/* RST0..RST7, then wrap to RST0 */
			memset(dc, 0, sizeof dc);
		}
	}
	while(self->ecs_bit_pos) { /* Byte alignment */
		jpeg_decoder_ecs_get_bit(self);
	}
#undef COMPONENT
}

/*--------------------------------------------------------------------------*/

/*
 * Top-level marker loop: parse segments from self->data_ptr until EOI.
 *
 * Handled markers:
 *   SOI  - ignored.
 *   DQT  - remembers a pointer to each 64-byte table (8-bit precision only).
 *   SOF0 - frame header; reports the size via self->size_proc when set.
 *   DHT  - remembers a pointer to the 16 counts of each table.
 *   DRI  - restart interval.
 *   SOS  - scan header, followed by jpeg_decoder_ecs_decode().
 *   EOI  - returns.
 *   0xFF - fill byte; the second 0xFF is re-read as a marker prefix.
 *   Any other marker is assumed to carry a length field and is skipped.
 *
 * Changes against the original (all concern malformed input):
 *  - DQT: the Pq/Tq byte indexed dqt_Qk[4] unchecked; values > 3 raise DIE().
 *  - DHT: Tc/Th indexed dht_Li[2][2] unchecked; values > 1 raise DIE().
 *  - SOF0: Hi/Vi outside 1..4 or Tq > 3 raise DIE().
 *  - SOS: Ns outside 1..3 overflowed sos_Csi/sos_Tdi/sos_Tai; Td/Ta > 1
 *    would later index dht_Li out of range; SOS before SOF0 left the frame
 *    fields zero. All raise DIE().
 *  - DQT/DHT length loops ran forever when the segment length did not match
 *    the table sizes; a mismatch now raises DIE().
 *  - case 0x00 fell through into the fill-byte case after DIE().
 *  - default: `self->data_ptr += get_half(self) - 2` read and modified
 *    data_ptr in one expression with unspecified ordering; the length is
 *    now read first. Lengths below 2 raise DIE().
 * DIE() is assumed not to return; the `return` statements after it are only
 * a fallback.
 */
static void jpeg_decoder_decode(JPEG_DECODER* self) {
	for(;;) {
		int mark_hi = jpeg_decoder_get_byte(self);
		int mark_lo = jpeg_decoder_get_byte(self);
		if(mark_hi != 0xFF) {
			DIE();
			return;
		}
		switch(mark_lo) {
		case 0xD8/*SOI*/:
			{
				/** no job **/
			}
			break;
		case 0xDB/*DQT*/:
			{
				int Lq = jpeg_decoder_get_half(self) - 2; /* Lq */
				int Tq;
				do {
					if(Lq < (1 + 64)) { /* length does not cover a whole table */
						DIE();
						return;
					}
					Tq = jpeg_decoder_get_byte(self); /* Pq(=0),Tq */
					if(Tq > 3) { /* Pq != 0 (16-bit table) or Tq out of range */
						DIE();
						return;
					}
					self->dqt_Qk[Tq] = self->data_ptr;
					self->data_ptr += 64; /* Qk */
				} while(Lq -= (1 + 64));
			}
			break;
		case 0xC0/*SOF0*/:
			{
				int Hi_Vi;
				int i;
				jpeg_decoder_get_half(self); /* Lf */
				jpeg_decoder_get_byte(self); /* P(=8) */
				self->sof_Y = jpeg_decoder_get_half(self); /* Y */
				self->sof_X = jpeg_decoder_get_half(self); /* X */
				self->sof_Nf = jpeg_decoder_get_byte(self); /* Nf */
				if((self->sof_Nf != 1) && (self->sof_Nf != 3)) { /* JFIF */
					DIE();
					return;
				}
				for(i = 0; i < self->sof_Nf; i++) {
					self->sof_Ci[i] = jpeg_decoder_get_byte(self); /* Ci */
					Hi_Vi = jpeg_decoder_get_byte(self); /* Hi,Vi */
					self->sof_Hi[i] = Hi_Vi >> 4;
					self->sof_Vi[i] = Hi_Vi & 15;
					if((self->sof_Hi[i] < 1) || (self->sof_Hi[i] > 4) ||
					   (self->sof_Vi[i] < 1) || (self->sof_Vi[i] > 4)) {
						DIE();
						return;
					}
					if(self->sof_Hi[i] > self->sof_Hi_max) self->sof_Hi_max = self->sof_Hi[i];
					if(self->sof_Vi[i] > self->sof_Vi_max) self->sof_Vi_max = self->sof_Vi[i];
					self->sof_Tq[i] = jpeg_decoder_get_byte(self); /* Tqi */
					if(self->sof_Tq[i] > 3) {
						DIE();
						return;
					}
				}
				if(self->size_proc) {
					self->size_proc(self->sof_X, self->sof_Y, self->user_data);
				}
			}
			break;
		case 0xC4/*DHT*/:
			{
				int Lh = jpeg_decoder_get_half(self) - 2; /* Lh */
				int Tc_Th;
				int Tc;
				int Th;
				int mt;
				int i;
				do {
					Tc_Th = jpeg_decoder_get_byte(self); /* Tc,Th */
					Tc = Tc_Th >> 4;
					Th = Tc_Th & 15;
					if((Tc > 1) || (Th > 1)) { /* dht_Li is [2][2] */
						DIE();
						return;
					}
					self->dht_Li[Tc][Th] = self->data_ptr;
					mt = 0; /* mt */
					for(i = 0; i < 16; i++) {
						mt += jpeg_decoder_get_byte(self); /* Li */
					}
					self->data_ptr += mt; /* Vi,j */
					Lh -= (1 + 16 + mt);
				} while(Lh > 0);
				if(Lh < 0) { /* tables ran past the declared length */
					DIE();
					return;
				}
			}
			break;
		case 0xDD/*DRI*/:
			{
				jpeg_decoder_get_half(self); /* Lr */
				self->dri_Ri = jpeg_decoder_get_half(self); /* Ri */
			}
			break;
		case 0xDA/*SOS*/:
			{
				int Tdi_Tai;
				int i;
				jpeg_decoder_get_half(self); /* Ls */
				self->sos_Ns = jpeg_decoder_get_byte(self); /* Ns */
				if((self->sos_Ns < 1) || (self->sos_Ns > 3) || !self->sof_Nf) {
					DIE();
					return;
				}
				for(i = 0; i < self->sos_Ns; i++) {
					self->sos_Csi[i] = jpeg_decoder_get_byte(self); /* Csi */
					Tdi_Tai = jpeg_decoder_get_byte(self); /* Tdi,Tai */
					self->sos_Tdi[i] = Tdi_Tai >> 4;
					self->sos_Tai[i] = Tdi_Tai & 15;
					if((self->sos_Tdi[i] > 1) || (self->sos_Tai[i] > 1)) {
						DIE();
						return;
					}
				}
				jpeg_decoder_get_byte(self); /* Ss */
				jpeg_decoder_get_byte(self); /* Se */
				jpeg_decoder_get_byte(self); /* Ah,Al */
				jpeg_decoder_ecs_decode(self);
			}
			break;
		case 0xD9/*EOI*/:
			{
				return; /* done */
			}
		case 0x00:
			{
				/* FF 00 is only valid as a stuffed byte inside scan data. */
				DIE();
			}
			return;
		case 0xFF:
			{
				self->data_ptr--; /* Fill byte */
			}
			break;
		default:
			{
				/* Read the length first: the helper itself advances data_ptr. */
				int L = jpeg_decoder_get_half(self); /* L? */
				if(L < 2) {
					DIE();
					return;
				}
				self->data_ptr += (L - 2);
			}
			break;
		}
	}
}

/*--------------------------------------------------------------------------*/

/*
 * Decode a JPEG image held in memory.
 *
 * data_ptr  : start of the JPEG data.
 * size_proc : optional (may be NULL); called with the image width and
 *             height when the SOF0 header has been parsed.
 * draw_proc : optional (may be NULL); called once per visible pixel with
 *             its coordinates and a 0..255 level.
 * user_data : opaque pointer forwarded to both callbacks.
 * return    : the read position when decoding stopped, i.e. just past the
 *             EOI marker on a normal return.
 */
const void*
jpeg_decode(const void* data_ptr,
	void (*size_proc)(int w, int h, void* user_data),
	void (*draw_proc)(int x, int y, int c, void* user_data),
	void* user_data)
{
	JPEG_DECODER state;
	JPEG_DECODER* const dec = &state;

	/* Start from an all-zero state, then fill in the caller's inputs. */
	memset(dec, 0, sizeof *dec);
	dec->user_data = user_data;
	dec->draw_proc = draw_proc;
	dec->size_proc = size_proc;
	dec->data_ptr  = data_ptr;

	jpeg_decoder_decode(dec);
	return dec->data_ptr;
}

