[HSPBC] Tips

	～グラフィック～

pget命令よりも高速に一点の色を取得 (要GDI32.DLL)

このＴｉｐｓは如何に短時間で一点の色を取得するかを検証しているものとなります。

「ＨＳＰのpget命令は遅い」と巷では耳にすることがあります。

１．pgetで所定の座標を指定して色を取り出す
２．システム変数に入った色を別の変数に退避する
３．特定の色との比較やら何らかの処理を行う

他のやり方でも処理工程数的には縮められそうもないのに何故遅いといわれるのか？
pget命令の元となるＡＰＩ関数GetPixelを使って違いを実際に計測してみましょう。
ＡＰＩ関数GetPixelについてはコチラを参照してください。

	// Load kernel32.dll and gdi32.dll and look up the two entry points this
	// listing calls later: GetTickCount (called from getoptime) and GetPixel
	// (called from the API benchmark loop).
	ll_libload kernel, "kernel32.dll"
	ll_getproc GetTickCount, "GetTickCount", kernel
	ll_libload gdi, "gdi32.dll"
	ll_getproc GetPixel, "GetPixel", gdi

#module
// getoptime <variable>
// Calls the GetTickCount entry point held in the global variable GetTickCount
// (looked up with ll_getproc outside this module) and writes the call's
// return value into the variable passed as the argument.
#deffunc getoptime val
	// Bind `time` to the argument through mref type 16 so that the ll_ret
	// below stores into the caller's variable.
	// NOTE(review): the meaning of mref type 16 is inferred from how callers
	// read the result - confirm against the HSP2 mref reference.
	mref time, 16
	// Call the function without arguments, then fetch its return value.
	ll_callfnv GetTickCount@
	ll_ret time
	return
#global

	// Setup: plot 10000 dots while redraw is switched off.
	// Each colour channel is independently 0 or 0xFF (rnd with range 2,
	// multiplied by 0xFF), and the x coordinate is shifted right by 250.
	// NOTE(review): the 250-pixel offset presumably keeps the dots clear of
	// the text printed by mes - confirm.
	randomize
	redraw 0
	repeat 10000
		rnd col, 2 : rnd col.1, 2 : rnd col.2, 2
		color col * 0xFF, col.1 * 0xFF, col.2 * 0xFF
		rnd i, winx - 250 : rnd i.1, winy
		pset i + 250, i.1
	loop
	redraw 1
	// Benchmark 1: read every pixel with pget, 10 timed runs.
	// tms.cnt / tme.cnt hold the start / end tick of run number cnt.
	repeat 10
		getoptime tms.cnt
		col = 0
		repeat winx * winy
			// cnt is the linear pixel index: x is the remainder and y the
			// quotient of cnt divided by winx.
			pget cnt \ winx, cnt / winx
			// Pack rval/gval/bval into one value with R in the low byte and
			// count pixels equal to 0x0000FF (R = 0xFF, G = B = 0).
			if rval + (gval << 8) + (bval << 16) = 0x0000FF : col++
		loop
		getoptime tme.cnt
		tm = tme.cnt - tms.cnt
		// color with its arguments omitted, issued before printing the result.
		color
		mes "" + cnt + ". " + tm + "ms 対象色の数=" + col
	loop
	// Average of the 10 elapsed times.
	i = 0 : repeat 10 : i += tme.cnt - tms.cnt : loop : i = i / 10
	mes "\npget命令の平均 " + i + "ms\n"
	// Benchmark 2: the same scan, calling the GetPixel API via ll_callfunc.
	// bmscr is bound through mref type 67; its element 4 is passed as the
	// first argument of GetPixel.
	// NOTE(review): element 4 is presumably the window's device context
	// handle (GetPixel takes hdc, x, y) - confirm against the BMSCR layout.
	mref bmscr, 67
	repeat 10
		getoptime tms.cnt
		col = 0
		repeat winx * winy
			// Argument array for the call: i.0 = bmscr.4, i.1 = x, i.2 = y.
			i = bmscr.4, cnt \ winx, cnt / winx
			ll_callfunc i, 3, GetPixel@
			ll_ret colors
			// The return value is compared directly with 0x0000FF, the same
			// value the pget loop tests for.
			if colors = 0x0000FF : col++
		loop
		getoptime tme.cnt
		tm = tme.cnt - tms.cnt
		color
		mes "" + cnt + ". " + tm + "ms 対象色の数=" + col
	loop
	// Average of the 10 elapsed times for the API version.
	i = 0 : repeat 10 : i += tme.cnt - tms.cnt : loop : i = i / 10
	mes "\nGetPixelの平均 " + i + "ms\n"
	stop

// Declare the two DLL functions called below: GetTickCount() with no
// parameters and GetPixel() with three int parameters.
#uselib "kernel32.dll"
#cfunc GetTickCount "GetTickCount"
#uselib "gdi32.dll"
#cfunc GetPixel "GetPixel" int, int, int

	// Setup: plot 10000 dots while redraw is switched off.
	// Each colour channel is independently 0 or 0xFF, and the x coordinate is
	// shifted right by 250.
	// NOTE(review): the 250-pixel offset presumably keeps the dots clear of
	// the text printed by mes - confirm.
	randomize
	redraw 0
	repeat 10000
		color rnd(2) * 0xFF, rnd(2) * 0xFF, rnd(2) * 0xFF
		pset rnd(ginfo_winx - 250) + 250, rnd(ginfo_winy)
	loop
	redraw 1
	// Benchmark 1: read every pixel with pget, 10 timed runs.
	// tms.cnt / tme.cnt hold the start / end tick of run number cnt.
	repeat 10
		tms.cnt = GetTickCount()
		col = 0
		repeat ginfo_winx * ginfo_winy
			// cnt is the linear pixel index: x is the remainder and y the
			// quotient of cnt divided by ginfo_winx.
			pget cnt \ ginfo_winx, cnt / ginfo_winx
			// Pack ginfo_r/ginfo_g/ginfo_b with R in the low byte and count
			// pixels equal to 0x0000FF (R = 0xFF, G = B = 0).
			if ginfo_r + (ginfo_g << 8) + (ginfo_b << 16) = 0x0000FF : col++
		loop
		tme.cnt = GetTickCount()
		// color with its arguments omitted, issued before printing the result.
		color
		mes strf("%02d回目 %dms 対象色の数=%d", cnt + 1, tme.cnt - tms.cnt, col)
	loop
	// Sum the elapsed times over all recorded runs and print the average.
	// i has not been assigned before this point, so the sum starts from the
	// variable's initial value (presumably 0 - confirm).
	foreach tms : i += tme.cnt - tms.cnt : loop
	mes strf("\npget命令の平均 %dms\n", i / length(tms))
	// Benchmark 2: the same scan, calling the GetPixel API directly.
	// The accumulator is reset here for the second average.
	i = 0
	// bmscr is bound through mref type 67; its element 4 is passed as the
	// first argument of GetPixel.
	// NOTE(review): element 4 is presumably the window's device context
	// handle (GetPixel takes hdc, x, y) - confirm against the BMSCR layout.
	mref bmscr, 67
	repeat 10
		tms.cnt = GetTickCount()
		col = 0
		repeat ginfo_winx * ginfo_winy
			// The return value is compared directly with 0x0000FF, the same
			// value the pget loop tests for.
			if GetPixel(bmscr.4, cnt \ ginfo_winx, cnt / ginfo_winx) = 0x0000FF : col++
		loop
		tme.cnt = GetTickCount()
		color
		mes strf("%02d回目 %dms 対象色の数=%d", cnt + 1, tme.cnt - tms.cnt, col)
	loop
	// Average of the elapsed times for the API version.
	foreach tms : i += tme.cnt - tms.cnt : loop
	mes strf("\nGetPixelの平均 %dms", i / length(tms))

上記スクリプトはランダムに配置したドットの中から赤色が何点あるかをカウントするものです。
ウィンドウサイズである６４０×４８０回分、合計３０７２００回繰り返した場合、管理人の環境では、
ＨＳＰ２のpget命令は平均４３４ミリ秒、ＡＰＩ関数は平均４２３ミリ秒。
ＨＳＰ３のpget命令は平均６４３ミリ秒、ＡＰＩ関数は平均４７１ミリ秒に終わりました。
pget命令同士でも「遅くなったと言われるＨＳＰ３」はＨＳＰ２よりも約２００ミリ秒遅い結果でした。
ＡＰＩ関数GetPixelを使ったやり方だと、
遅くなったＨＳＰ３の場合はＨＳＰ２のpget命令の平均タイムに近い４００ミリ秒台後半にまで縮まり、
ＨＳＰ２では確かにpget命令よりも速くはなりましたが１０ミリ秒ほどしか縮まりませんでした。

コレ以上縮めることはできないのでしょうか？
「大きな画像の全部または大部分を取得するために繰り返し色取得命令を使うこと」に問題がありそうなので、
pget命令やＡＰＩ関数GetPixelの取得元となる「座標ごとの輝度を一元管理しているメモリ領域」
即ちＶＲＡＭに直接アクセスすれば更に処理時間を短縮することができそうですね。
ＶＲＡＭはＨＳＰ２・ＨＳＰ３共にmref命令で取得することが可能で、
その方法で同じように赤色が何点あるかをカウントするプログラムを組んでみました。
尚、ＶＲＡＭの説明と扱い方についてはコチラを参照してください。

	// Load kernel32.dll and look up GetTickCount, the entry point that
	// getoptime calls for timing.
	ll_libload kernel, "kernel32.dll"
	ll_getproc GetTickCount, "GetTickCount", kernel

#module
// getoptime <variable>
// Calls the GetTickCount entry point held in the global variable GetTickCount
// (looked up with ll_getproc outside this module) and writes the call's
// return value into the variable passed as the argument.
#deffunc getoptime val
	// Bind `time` to the argument through mref type 16 so that the ll_ret
	// below stores into the caller's variable.
	// NOTE(review): the meaning of mref type 16 is inferred from how callers
	// read the result - confirm against the HSP2 mref reference.
	mref time, 16
	// Call the function without arguments, then fetch its return value.
	ll_callfnv GetTickCount@
	ll_ret time
	return
#global

	// Setup: plot 10000 dots while redraw is switched off.
	// Each colour channel is independently 0 or 0xFF (rnd with range 2,
	// multiplied by 0xFF), and the x coordinate is shifted right by 250.
	// NOTE(review): the 250-pixel offset presumably keeps the dots clear of
	// the text printed by mes - confirm.
	randomize
	redraw 0
	repeat 10000
		rnd col, 2 : rnd col.1, 2 : rnd col.2, 2
		color col * 0xFF, col.1 * 0xFF, col.2 * 0xFF
		rnd i, winx - 250 : rnd i.1, winy
		pset i + 250, i.1
	loop
	redraw 1
	// Benchmark 1: read every pixel with pget, 10 timed runs.
	// tms.cnt / tme.cnt hold the start / end tick of run number cnt.
	repeat 10
		getoptime tms.cnt
		col = 0
		repeat winx * winy
			// cnt is the linear pixel index: x is the remainder and y the
			// quotient of cnt divided by winx.
			pget cnt \ winx, cnt / winx
			// Pack rval/gval/bval into one value with R in the low byte and
			// count pixels equal to 0x0000FF (R = 0xFF, G = B = 0).
			if rval + (gval << 8) + (bval << 16) = 0x0000FF : col++
		loop
		getoptime tme.cnt
		tm = tme.cnt - tms.cnt
		// color with its arguments omitted, issued before printing the result.
		color
		mes "" + cnt + ". " + tm + "ms 対象色の数=" + col
	loop
	// Average of the 10 elapsed times.
	i = 0 : repeat 10 : i += tme.cnt - tms.cnt : loop : i = i / 10
	mes "\npget命令の平均 " + i + "ms\n"
	// Benchmark 2: read the pixel bytes straight from the buffer that
	// mref type 66 binds to `vram`.
	mref vram, 66
	// After ginfo 6 the code uses prmx / prmy as the buffer width / height.
	// NOTE(review): confirm what ginfo mode 6 reports in HSP2.
	ginfo 6
	// i.0 = bytes per row: 3 bytes per pixel, rounded up to a multiple of 4.
	i.0 = (prmx * 3 + 3) & 0xFFFFFFFC
	repeat 10
		getoptime tms.cnt
		col = 0
		repeat prmx * prmy
			// i.1 = byte offset of pixel (x, y): the row index is reversed
			// (prmy - 1 - y) and each pixel takes 3 bytes within its row.
			i.1 = (prmy - 1 - (cnt / prmx)) * i + (cnt \ prmx * 3)
			// The bytes at +2, +1 and +0 become the low, middle and high byte
			// of the packed value, matching the packing in the pget loop.
			peek i.2, vram, i.1 + 2
			peek i.3, vram, i.1 + 1
			peek i.4, vram, i.1
			if i.2 + (i.3 << 8) + (i.4 << 16) = 0x0000FF : col++
		loop
		getoptime tme.cnt
		tm = tme.cnt - tms.cnt
		color
		mes "" + cnt + ". " + tm + "ms 対象色の数=" + col
	loop
	// Average of the 10 elapsed times for the direct-read version.
	i = 0 : repeat 10 : i += tme.cnt - tms.cnt : loop : i = i / 10
	mes "\nVRAMの平均 " + i + "ms"
	stop

// Declare GetTickCount() from kernel32.dll; it is called below to time
// each benchmark run.
#uselib "kernel32.dll"
#cfunc GetTickCount "GetTickCount"

	// Setup: plot 10000 dots while redraw is switched off.
	// Each colour channel is independently 0 or 0xFF, and the x coordinate is
	// shifted right by 250.
	// NOTE(review): the 250-pixel offset presumably keeps the dots clear of
	// the text printed by mes - confirm.
	randomize
	redraw 0
	repeat 10000
		color rnd(2) * 0xFF, rnd(2) * 0xFF, rnd(2) * 0xFF
		pset rnd(ginfo_winx - 250) + 250, rnd(ginfo_winy)
	loop
	redraw 1
	// Benchmark 1: read every pixel with pget, 10 timed runs.
	// tms.cnt / tme.cnt hold the start / end tick of run number cnt.
	repeat 10
		tms.cnt = GetTickCount()
		col = 0
		repeat ginfo_winx * ginfo_winy
			// cnt is the linear pixel index: x is the remainder and y the
			// quotient of cnt divided by ginfo_winx.
			pget cnt \ ginfo_winx, cnt / ginfo_winx
			// Pack ginfo_r/ginfo_g/ginfo_b with R in the low byte and count
			// pixels equal to 0x0000FF (R = 0xFF, G = B = 0).
			if ginfo_r + (ginfo_g << 8) + (ginfo_b << 16) = 0x0000FF : col++
		loop
		tme.cnt = GetTickCount()
		// color with its arguments omitted, issued before printing the result.
		color
		mes strf("%02d回目 %dms 対象色の数=%d", cnt + 1, tme.cnt - tms.cnt, col)
	loop
	// Sum the elapsed times over all recorded runs and print the average.
	// i has not been assigned before this point, so the sum starts from the
	// variable's initial value (presumably 0 - confirm).
	foreach tms : i += tme.cnt - tms.cnt : loop
	mes strf("\npget命令の平均 %dms\n", i / length(tms))
	// Benchmark 2: read the pixel bytes straight from the buffer that
	// mref type 66 binds to `vram`.
	mref vram, 66
	// i (element 0) = bytes per row: 3 bytes per pixel, rounded up to a
	// multiple of 4. This overwrites the sum accumulated above.
	i = (ginfo_sx * 3 + 3) & 0xFFFFFFFC
	repeat 10
		tms.cnt = GetTickCount()
		col = 0
		repeat ginfo_sx * ginfo_sy
			// i.1 = byte offset of pixel (x, y): the row index is reversed
			// (ginfo_sy - 1 - y) and each pixel takes 3 bytes within its row.
			i.1 = ((ginfo_sy - 1 - cnt / ginfo_sx) * i) + cnt \ ginfo_sx * 3
			// The bytes at +2, +1 and +0 become the low, middle and high byte
			// of the packed value, matching the packing in the pget loop.
			if peek(vram, i.1 + 2) + (peek(vram, i.1 + 1) << 8) + (peek(vram, i.1) << 16) = 0x0000FF : col++
		loop
		tme.cnt = GetTickCount()
		color
		mes strf("%02d回目 %dms 対象色の数=%d", cnt + 1, tme.cnt - tms.cnt, col)
	loop
	// i held the row stride; reset it before summing the elapsed times.
	i = 0 : foreach tms : i += tme.cnt - tms.cnt : loop
	mes strf("\nVRAMの平均　 %dms", i / length(tms))

結果、ＶＲＡＭでの取得はＨＳＰ２が３３０ミリ秒、ＨＳＰ３は３２５ミリ秒と逆転しました。
逆転はたまたまなのかもわかりませんが、僅差に縮まったことは確かでしょう。

処理回数を減らすことで速度向上するなら、peek命令をwpeek命令やlpeek命令に変えることで
コレ以上に処理時間を縮めることもできるでしょう。
右端の座標を指定するときにパディングデータも含めてしまうとエラーになるので、
その時だけは通常のpeek命令で取得するようにしなければなりません。
ＨＳＰ２は予め変数のポインタをll_getptr命令で取得する必要がありますが、
ll_peek命令を使うことで３バイト分だけ指定して取得することが出来ます。

	// Load kernel32.dll and look up GetTickCount, the entry point that
	// getoptime calls for timing.
	ll_libload kernel, "kernel32.dll"
	ll_getproc GetTickCount, "GetTickCount", kernel

#module
// getoptime <variable>
// Calls the GetTickCount entry point held in the global variable GetTickCount
// (looked up with ll_getproc outside this module) and writes the call's
// return value into the variable passed as the argument.
#deffunc getoptime val
	// Bind `time` to the argument through mref type 16 so that the ll_ret
	// below stores into the caller's variable.
	// NOTE(review): the meaning of mref type 16 is inferred from how callers
	// read the result - confirm against the HSP2 mref reference.
	mref time, 16
	// Call the function without arguments, then fetch its return value.
	ll_callfnv GetTickCount@
	ll_ret time
	return
#global

	// Setup: plot 10000 dots while redraw is switched off.
	// Each colour channel is independently 0 or 0xFF (rnd with range 2,
	// multiplied by 0xFF), and the x coordinate is shifted right by 250.
	// NOTE(review): the 250-pixel offset presumably keeps the dots clear of
	// the text printed by mes - confirm.
	randomize
	redraw 0
	repeat 10000
		rnd col, 2 : rnd col.1, 2 : rnd col.2, 2
		color col * 0xFF, col.1 * 0xFF, col.2 * 0xFF
		rnd i, winx - 250 : rnd i.1, winy
		pset i + 250, i.1
	loop
	redraw 1
	// Benchmark 1: read every pixel with pget, 10 timed runs.
	// tms.cnt / tme.cnt hold the start / end tick of run number cnt.
	repeat 10
		getoptime tms.cnt
		col = 0
		repeat winx * winy
			// cnt is the linear pixel index: x is the remainder and y the
			// quotient of cnt divided by winx.
			pget cnt \ winx, cnt / winx
			// Pack rval/gval/bval into one value with R in the low byte and
			// count pixels equal to 0x0000FF (R = 0xFF, G = B = 0).
			if rval + (gval << 8) + (bval << 16) = 0x0000FF : col++
		loop
		getoptime tme.cnt
		tm = tme.cnt - tms.cnt
		// color with its arguments omitted, issued before printing the result.
		color
		mes "" + cnt + ". " + tm + "ms 対象色の数=" + col
	loop
	// Average of the 10 elapsed times.
	i = 0 : repeat 10 : i += tme.cnt - tms.cnt : loop : i = i / 10
	mes "\npget命令の平均 " + i + "ms\n"
	// Benchmark 2: fewer reads per pixel - one 3-byte ll_peek instead of
	// three single-byte reads.
	mref vram, 66
	// After ginfo 6 the code uses prmx / prmy as the buffer width / height.
	// NOTE(review): confirm what ginfo mode 6 reports in HSP2.
	ginfo 6
	// Fetch the address of the buffer bound to `vram`; ll_peek below takes a
	// raw address rather than a variable.
	ll_getptr vram : ll_ret vram_adr
	// i.0 = bytes per row: 3 bytes per pixel, rounded up to a multiple of 4.
	i.0 = (prmx * 3 + 3) & 0xFFFFFFFC
	repeat 10
		getoptime tms.cnt
		col = 0
		repeat prmx * prmy
			// i.1 = byte offset of pixel (x, y): the row index is reversed
			// (prmy - 1 - y) and each pixel takes 3 bytes within its row.
			i.1 = (prmy - 1 - (cnt / prmx)) * i + (cnt \ prmx * 3)
			// Copy the pixel's 3 bytes into i.2 in one call. The byte at
			// offset +2 ends up in bits 16-23, so the test value is 0xFF0000
			// here instead of the 0x0000FF used with pget.
			// NOTE(review): the fourth byte of i.2 is never written by this
			// 3-byte copy; the comparison relies on it staying 0 - confirm.
			ll_peek i.2, vram_adr + i.1, 3
			if i.2 = 0xFF0000 : col++
		loop
		getoptime tme.cnt
		tm = tme.cnt - tms.cnt
		color
		mes "" + cnt + ".　 " + tm + "ms 対象色の数=" + col
	loop
	// Average of the 10 elapsed times for the reduced-read version.
	i = 0 : repeat 10 : i += tme.cnt - tms.cnt : loop : i = i / 10
	mes "\n取得回数減少版VRAMの平均 " + i + "ms"
	stop

// Declare GetTickCount() from kernel32.dll; it is called below to time
// each benchmark run.
#uselib "kernel32.dll"
#cfunc GetTickCount "GetTickCount"

	// Setup: plot 10000 dots while redraw is switched off.
	// Each colour channel is independently 0 or 0xFF, and the x coordinate is
	// shifted right by 250.
	// NOTE(review): the 250-pixel offset presumably keeps the dots clear of
	// the text printed by mes - confirm.
	randomize
	redraw 0
	repeat 10000
		color rnd(2) * 0xFF, rnd(2) * 0xFF, rnd(2) * 0xFF
		pset rnd(ginfo_winx - 250) + 250, rnd(ginfo_winy)
	loop
	redraw 1
	// Benchmark 1: read every pixel with pget, 10 timed runs.
	// tms.cnt / tme.cnt hold the start / end tick of run number cnt.
	repeat 10
		tms.cnt = GetTickCount()
		col = 0
		repeat ginfo_winx * ginfo_winy
			// cnt is the linear pixel index: x is the remainder and y the
			// quotient of cnt divided by ginfo_winx.
			pget cnt \ ginfo_winx, cnt / ginfo_winx
			// Pack ginfo_r/ginfo_g/ginfo_b with R in the low byte and count
			// pixels equal to 0x0000FF (R = 0xFF, G = B = 0).
			if ginfo_r + (ginfo_g << 8) + (ginfo_b << 16) = 0x0000FF : col++
		loop
		tme.cnt = GetTickCount()
		// color with its arguments omitted, issued before printing the result.
		color
		mes strf("%02d回目 %dms 対象色の数=%d", cnt + 1, tme.cnt - tms.cnt, col)
	loop
	// Sum the elapsed times over all recorded runs and print the average.
	// i has not been assigned before this point, so the sum starts from the
	// variable's initial value (presumably 0 - confirm).
	foreach tms : i += tme.cnt - tms.cnt : loop
	mes strf("\npget命令の平均 %dms\n", i / length(tms))
	// Benchmark 2: fewer reads per pixel - one 4-byte lpeek where possible
	// instead of three single-byte peeks.
	mref vram, 66
	// i (element 0) = bytes per row: 3 bytes per pixel, rounded up to a
	// multiple of 4. This overwrites the sum accumulated above.
	i = (ginfo_sx * 3 + 3) & 0xFFFFFFFC
	repeat 10
		tms.cnt = GetTickCount()
		col = 0
		repeat ginfo_sx * ginfo_sy
			// i.1 = byte offset of pixel (x, y): the row index is reversed
			// (ginfo_sy - 1 - y) and each pixel takes 3 bytes within its row.
			i.1 = ((ginfo_sy - 1 - cnt / ginfo_sx) * i) + cnt \ ginfo_sx * 3
			if cnt \ ginfo_sx = ginfo_sx - 1 {
				// Rightmost column: lpeek would read one byte past the pixel,
				// so fall back to three byte reads packed as in the pget loop.
				if peek(vram, i.1 + 2) + (peek(vram, i.1 + 1) << 8) + (peek(vram, i.1) << 16) = 0x0000FF : col++
			} else {
				// Read 4 bytes and mask off the top one. The byte at offset +2
				// sits in bits 16-23, so the test value is 0xFF0000 here.
				if (lpeek(vram, i.1) & 0xFFFFFF) = 0xFF0000 : col++
			}
		loop
		tme.cnt = GetTickCount()
		color
		mes strf("%02d回目 %dms 対象色の数=%d", cnt + 1, tme.cnt - tms.cnt, col)
	loop
	// i held the row stride; reset it before summing the elapsed times.
	i = 0 : foreach tms : i += tme.cnt - tms.cnt : loop
	mes strf("\n取得回数減少版VRAMの平均　 %dms", i / length(tms))

取得回数を減少させた結果、ＨＳＰ２が２１８ミリ秒、ＨＳＰ３は２８０ミリ秒でした。
なんと、ＨＳＰ２では１００ミリ秒も縮まってかなり高速化できたようです。
再度ＨＳＰ２とＨＳＰ３の処理時間に開きが出たのは、
右端部取得時にpeek命令を３回繰り返しているか、ll_peek命令１回で取得しているかの違いでしょう。

今回の検証はココまでとして最終結果だけ残しておきます。
ＨＳＰ２のpget命令から取得回数減少版ＶＲＡＭのやり方で、４３４ミリ秒から２１８ミリ秒に、
ＨＳＰ３のpget命令から取得回数減少版ＶＲＡＭのやり方で、６４３ミリ秒から２８０ミリ秒になりました。
コレが最速なやり方かどうかはわかりません…というか、恐らく最速ではないと思います。
以上で検証したように、処理時間削減には取得回数を如何に減らすかがポイントであるようなので、
処理方法に応じて最適なやり方を選択するようにしてください。
尚、どのような時でもpget命令よりＶＲＡＭの方が速いワケではないことを覚えておいてください。
今回のように、ウィンドウ内全てのピクセルデータを調べる必要がある場合、
言い換えると何万回・何十万回も取得処理を繰り返す必要がある場合にＶＲＡＭでの取得は有効ですが、
そんなに取得する必要がない場合はpget命令の方が圧倒的に速くなる可能性もあります。
「今回取得しようとしているもの」がどういうものかを見極めて臨機応変に対応すると良いでしょう。