This codelet allows you to emit asm instructions at compile time or at runtime, even if the GNU assembler does not yet support them. Thus we can easily play with the VFPU instruction set. The example below demonstrates this by initializing the VFPU vector register set to zero and then loading an identity matrix with a single instruction.
This raw example only contains some of the most basic instructions and register names (only GPRs, and vector and matrix Quadword addressing). Nevertheless, it should give you an idea of how to add new opcodes and play with them.
All opcodes and instruction encodings are defined in codegen.h, which is also the place to document them. To try it out, create a directory pspgl/test-vfpu/ and copy the following files into this folder:
Makefile:
Code: Select all
# Builds the test-vfpu executable with the PSP cross toolchain.
#
# Targets:
#   all      (default) compile main.c and link $(TARGET)
#   install  hand $(TARGET) to the psp-install tool, titled with the build date
#   clean    remove build products
#
# Comments are kept on their own lines on purpose: make keeps trailing
# whitespace in a variable value, so nothing may follow an assignment.

# Toolchain prefix and tools.
ARCH = psp-
CC = $(ARCH)gcc
PSP_INSTALL = ../tools/psp-install
RM = rm -f

# Location of the installed PSPSDK (headers and libraries).
PSPPATH := $(shell psp-config --pspsdk-path)

LIBS = -lpspdebug -lpspdisplay -lpspge -lpspsdk -lpspctrl -lm -lc -lpspuser -lpspkernel
# -MD makes the compiler write a .d dependency file per object (included below).
CFLAGS = -g -Wall -O2 -MD -I$(PSPPATH)/include
LFLAGS = -g -Wall -O2 -L$(PSPPATH)/lib $(LIBS)

TARGET = test-vfpu
OBJS = main.o
BUILDDATE = $(shell date "+%Y/%m/%d %k:%M:%S")
# Same path as PSPPATH; not referenced by the rules below, kept for compatibility.
PSPSDK=$(shell psp-config --pspsdk-path)

# These targets do not name files; without .PHONY a stray file called
# "all", "install" or "clean" would silently disable them.
.PHONY: all install clean

all: $(TARGET)

.c.o:
	$(CC) $(CFLAGS) -c $<

$(TARGET): $(OBJS)
	$(CC) $(OBJS) $(LFLAGS) -o $@

install: all
	$(PSP_INSTALL) $(TARGET) --eboot-title="$(TARGET) $(BUILDDATE)"

clean:
	$(RM) $(TARGET) *.d *.o *.a *.elf *.sfo EBOOT.PBP

# Pull in the generated dependency files; "dummy" keeps the argument list
# non-empty when no .d file exists yet (the leading '-' ignores missing files).
-include $(wildcard *.d) dummy
codegen.h:
Code: Select all
#ifndef __codegen_h__
#define __codegen_h__
/*
 * GPR register set: symbolic names for the 32 general purpose register
 * numbers, for use in the rt/rs/base fields of the encoders below.
 *
 * These must stay plain #defines expanding to integer literals: cgen_asm()
 * stringifies the fully expanded expression and hands it to the assembler.
 *
 * Registers 8..11 are the argument registers a4..a7.  Earlier revisions
 * misspelled the last two as R_v6/R_v7; the old names are kept as aliases
 * so existing users keep compiling.
 */
#define  R_zero 0
#define  R_at   1
#define  R_v0   2
#define  R_v1   3
#define  R_a0   4
#define  R_a1   5
#define  R_a2   6
#define  R_a3   7
#define  R_a4   8
#define  R_a5   9
#define  R_a6   10
#define  R_a7   11
#define  R_v6   10	/* legacy alias of R_a6 */
#define  R_v7   11	/* legacy alias of R_a7 */
#define  R_t0   12
#define  R_t1   13
#define  R_t2   14
#define  R_t3   15
#define  R_s0   16
#define  R_s1   17
#define  R_s2   18
#define  R_s3   19
#define  R_s4   20
#define  R_s5   21
#define  R_s6   22
#define  R_s7   23
#define  R_t8   24
#define  R_t9   25
#define  R_k0   26
#define  R_k1   27
#define  R_gp   28
#define  R_sp   29
#define  R_s8   30
#define  R_ra   31
/*
 * VFPU registers, Quadword addressing.
 *
 * Name layout: Q_<C|R><matrix><digit><digit>.  The encoded number is
 *     (matrix << 2) | vector_index            for the C (column) names
 *     0x20 | (matrix << 2) | vector_index     for the R (row) names
 * i.e. bits 4..2 select one of the 8 matrices, bits 1..0 the vector inside
 * it, and bit 5 switches from column to row addressing.  For the C names
 * the vector index is the second digit, for the R names the third.
 */

/* column addressing, matrices 0..7 */
#define  Q_C000  0x00
#define  Q_C010  0x01
#define  Q_C020  0x02
#define  Q_C030  0x03

#define  Q_C100  0x04
#define  Q_C110  0x05
#define  Q_C120  0x06
#define  Q_C130  0x07

#define  Q_C200  0x08
#define  Q_C210  0x09
#define  Q_C220  0x0a
#define  Q_C230  0x0b

#define  Q_C300  0x0c
#define  Q_C310  0x0d
#define  Q_C320  0x0e
#define  Q_C330  0x0f

#define  Q_C400  0x10
#define  Q_C410  0x11
#define  Q_C420  0x12
#define  Q_C430  0x13

#define  Q_C500  0x14
#define  Q_C510  0x15
#define  Q_C520  0x16
#define  Q_C530  0x17

#define  Q_C600  0x18
#define  Q_C610  0x19
#define  Q_C620  0x1a
#define  Q_C630  0x1b

#define  Q_C700  0x1c
#define  Q_C710  0x1d
#define  Q_C720  0x1e
#define  Q_C730  0x1f

/* row addressing, matrices 0..7 */
#define  Q_R000  0x20
#define  Q_R001  0x21
#define  Q_R002  0x22
#define  Q_R003  0x23

#define  Q_R100  0x24
#define  Q_R101  0x25
#define  Q_R102  0x26
#define  Q_R103  0x27

#define  Q_R200  0x28
#define  Q_R201  0x29
#define  Q_R202  0x2a
#define  Q_R203  0x2b

#define  Q_R300  0x2c
#define  Q_R301  0x2d
#define  Q_R302  0x2e
#define  Q_R303  0x2f

#define  Q_R400  0x30
#define  Q_R401  0x31
#define  Q_R402  0x32
#define  Q_R403  0x33

#define  Q_R500  0x34
#define  Q_R501  0x35
#define  Q_R502  0x36
#define  Q_R503  0x37

#define  Q_R600  0x38
#define  Q_R601  0x39
#define  Q_R602  0x3a
#define  Q_R603  0x3b

#define  Q_R700  0x3c
#define  Q_R701  0x3d
#define  Q_R702  0x3e
#define  Q_R703  0x3f
/*
 * VFPU registers, 4x4 Matrix (Quad) addressing.
 *
 * The digit after the letter selects the matrix; the encoded number is
 * (matrix << 2) for the M names and 0x20 | (matrix << 2) for the E names.
 * NOTE(review): the E names presumably address the transposed view of the
 * same matrix -- confirm.
 */
#define  Q_M000  0x00
#define  Q_M100  0x04
#define  Q_M200  0x08
#define  Q_M300  0x0c
#define  Q_M400  0x10
#define  Q_M500  0x14
#define  Q_M600  0x18
#define  Q_M700  0x1c

#define  Q_E000  0x20
#define  Q_E100  0x24
#define  Q_E200  0x28
#define  Q_E300  0x2c
#define  Q_E400  0x30
#define  Q_E500  0x34
#define  Q_E600  0x38
#define  Q_E700  0x3c
/*
+-------------+------------+---------+---------------------------------------+
|31         26|25        21|20     16|15                                   0 |
+-------------+------------+---------+---------------------------------------+
| opcode 0x8c |  base[4-0] | rt[4-0] |             offset[15-0]              |
+-------------+------------+---------+---------------------------------------+

  lw %rt, offset(%base)  --  load a word, addressed relative to a GPR

	rt:	GPR that receives the loaded word (0...31)
	base:	GPR holding the base address
	offset:	signed byte offset added to the base; truncated to 16 bits

    %rt <- word_at_address (offset + %base)

  Expands to the 32-bit instruction word.  rt and base are not range
  checked.
*/
#define lw(rt,offset,base)		\
	(0x8c000000 |			\
	 ((base) << 21) |		\
	 ((rt) << 16) |			\
	 ((offset) & 0xffff))
/*
+-------------+------------+---------+---------------------------------------+
|31         26|25        21|20     16|15                                   0 |
+-------------+------------+---------+---------------------------------------+
| opcode 0xac |  base[4-0] | rt[4-0] |             offset[15-0]              |
+-------------+------------+---------+---------------------------------------+

  sw %rt, offset(%base)  --  store a word, addressed relative to a GPR

	rt:	GPR whose value is stored (0...31)
	base:	GPR holding the base address of the destination
	offset:	signed byte offset added to the base; truncated to 16 bits

    word_at_address (offset + %base) <- %rt

  Expands to the 32-bit instruction word.  rt and base are not range
  checked.
*/
#define sw(rt,offset,base)		\
	(0xac000000 |			\
	 ((base) << 21) |		\
	 ((rt) << 16) |			\
	 ((offset) & 0xffff))
/*
+-------------+------------+---------+---------------------------------------+
|31         26|25        21|20     16|15                                   0 |
+-------------+------------+---------+---------------------------------------+
| opcode 0x24 |   rs[4-0]  | rt[4-0] |              immediate                |
+-------------+------------+---------+---------------------------------------+

  addiu %rt, %rs, immediate  --  Add Immediate Unsigned Word

	rt:		GPR receiving the sum (0...31)
	rs:		GPR supplying the first operand (0...31)
	immediate:	value added to %rs; truncated to 16 bits here

    %rt <- %rs + sign_extended(immediate)

  Expands to the 32-bit instruction word.  rt and rs are not range checked.
*/
#define addiu(rt,rs,immediate)		\
	(0x24000000 |			\
	 ((rs) << 21) |			\
	 ((rt) << 16) |			\
	 ((immediate) & 0xffff))
/*
+-------------+-----------+---------+----------------------------+-----+-----+
|31         26|25       21|20     16|15                        2 |  1  |  0  |
+-------------+-----------+---------+----------------------------+-----+-----+
| opcode 0xd8 | base[4-0] | vt[4-0] |         offset[15-2]       |  0  |vt[5]|
+-------------+-----------+---------+----------------------------+-----+-----+

  lv.q %vfpu_rt, offset(%base)  --  LoadVector.Quadword relative to a GPR

	vfpu_rt:	VFPU quadword target register, 0..63
			(column names 0-31, row names 32-63).  The low five
			bits go to bits 20..16, bit 5 goes to bit 0.
	offset:		signed offset in 32-bit WORDS (the macro shifts it
			left by two to form the byte offset); only 14 bits
			are encoded, higher bits are masked off so that a
			negative offset cannot spill into the register fields.
	base:		GPR holding the base address
	cache_policy:	only bit 0 is used; it is placed in bit 1 of the
			instruction, which the layout above shows as 0 for
			lv.q.  Pass 0 unless you are experimenting.

    vfpu_rt <- vector_at_address ((offset << 2) + %base)

  The final address must be aligned.  NOTE(review): this was originally
  documented as 64-byte alignment, but a quadword is only 16 bytes --
  confirm the real requirement.

  Fixes over the previous revision: the body referenced an undefined name
  (vfpu_vtreg) and extracted vt[5] with ">> 4", which set bit 0 for
  registers 16..31 and bit 1 for registers 32..63.
*/
#define lv_q(vfpu_rt,offset,base,cache_policy)			\
	(0xd8000000 |						\
	 ((base) << 21) |					\
	 (((vfpu_rt) & 0x1f) << 16) |				\
	 (((vfpu_rt) >> 5) & 1) |				\
	 (((offset) & 0x3fff) << 2) |				\
	 (((cache_policy) & 1) << 1))
/*
+-------------+-----------+---------+----------------------------+-----+-----+
|31         26|25       21|20     16|15                        2 |  1  |  0  |
+-------------+-----------+---------+----------------------------+-----+-----+
| opcode 0xf8 | base[4-0] | vt[4-0] |         offset[15-2]       | c_p |vt[5]|
+-------------+-----------+---------+----------------------------+-----+-----+

  sv.q %vfpu_rt, offset(%base), cache_policy
				--  StoreVector.Quadword relative to a GPR

	vfpu_rt:	VFPU quadword source register, 0..63
			(column names 0-31, row names 32-63).  The low five
			bits go to bits 20..16, bit 5 goes to bit 0.
	offset:		signed offset in 32-bit WORDS (the macro shifts it
			left by two to form the byte offset); only 14 bits
			are encoded, higher bits are masked off so that a
			negative offset cannot spill into the register fields.
	base:		GPR holding the base address of the destination
	cache_policy:	0 = write-through, 1 = write-back (only bit 0 used)

    vector_at_address ((offset << 2) + %base) <- vfpu_rt

  The final address must be aligned.  NOTE(review): this was originally
  documented as 64-byte alignment, but a quadword is only 16 bytes --
  confirm the real requirement.

  Fix over the previous revision: vt[5] was extracted with ">> 4", which
  set bit 0 for registers 16..31 (selecting register + 32) and leaked into
  the cache_policy bit for registers 32..63.
*/
#define sv_q(vfpu_rt,offset,base,cache_policy)			\
	(0xf8000000 |						\
	 ((base) << 21) |					\
	 (((vfpu_rt) & 0x1f) << 16) |				\
	 (((vfpu_rt) >> 5) & 1) |				\
	 (((offset) & 0x3fff) << 2) |				\
	 (((cache_policy) & 1) << 1))
/*
+-------------------------------------------------------------+--------------+
|31                                                         7 | 6         0  |
+-------------------------------------------------------------+--------------+
|              opcode 0xd0060000 | size bits                  | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+

  SetVectorZero.Single/Pair/Triple/Quad

    vzero.s %vfpu_rt	; 1 component  <- 0.0f	(size bits 0x0000)
    vzero.p %vfpu_rt	; 2 components <- 0.0f	(size bits 0x0080)
    vzero.t %vfpu_rt	; 3 components <- 0.0f	(size bits 0x8000)
    vzero.q %vfpu_rt	; 4 components <- 0.0f	(size bits 0x8080)

	vfpu_rt:	VFPU Vector Target Register ([s|p|t|q]reg 0..127);
			OR-ed in unmasked, so keep it within 7 bits.

    vfpu_regs[%vfpu_rt] <- 0.0f
*/
#define vzero_s(vfpu_rt)  (0xd0060000 | 0x0000 | (vfpu_rt))
#define vzero_p(vfpu_rt)  (0xd0060000 | 0x0080 | (vfpu_rt))
#define vzero_t(vfpu_rt)  (0xd0060000 | 0x8000 | (vfpu_rt))
#define vzero_q(vfpu_rt)  (0xd0060000 | 0x8080 | (vfpu_rt))
/*
+-------------------------------------------------------------+--------------+
|31                                                         7 | 6         0  |
+-------------------------------------------------------------+--------------+
|              opcode 0xd0070000 | size bits                  | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+

  SetVectorOne.Single/Pair/Triple/Quad

    vone.s %vfpu_rt	; 1 component  <- 1.0f	(size bits 0x0000)
    vone.p %vfpu_rt	; 2 components <- 1.0f	(size bits 0x0080)
    vone.t %vfpu_rt	; 3 components <- 1.0f	(size bits 0x8000)
    vone.q %vfpu_rt	; 4 components <- 1.0f	(size bits 0x8080)

	vfpu_rt:	VFPU Vector Target Register ([s|p|t|q]reg 0..127);
			OR-ed in unmasked, so keep it within 7 bits.

    vfpu_regs[%vfpu_rt] <- 1.0f
*/
#define vone_s(vfpu_rt)  (0xd0070000 | 0x0000 | (vfpu_rt))
#define vone_p(vfpu_rt)  (0xd0070000 | 0x0080 | (vfpu_rt))
#define vone_t(vfpu_rt)  (0xd0070000 | 0x8000 | (vfpu_rt))
#define vone_q(vfpu_rt)  (0xd0070000 | 0x8080 | (vfpu_rt))
/*
+-------------------------------------------------------------+--------------+
|31                                                         7 | 6         0  |
+-------------------------------------------------------------+--------------+
|              opcode 0xf3860000 | size bits                  | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+

  SetMatrixZero.Pair/Triple/Quad

    vmzero.p %vfpu_rt	; 2x2 submatrix <- 0.0f	(size bits 0x0080)
    vmzero.t %vfpu_rt	; 3x3 submatrix <- 0.0f	(size bits 0x8000)
    vmzero.q %vfpu_rt	; 4x4 matrix    <- 0.0f	(size bits 0x8080)

	vfpu_rt:	VFPU Matrix Target Register ([p|t|q]reg 0..127);
			OR-ed in unmasked, so keep it within 7 bits.

    vfpu_mtx[%vfpu_rt] <- 0.0f
*/
#define vmzero_p(vfpu_rt)  (0xf3860000 | 0x0080 | (vfpu_rt))
#define vmzero_t(vfpu_rt)  (0xf3860000 | 0x8000 | (vfpu_rt))
#define vmzero_q(vfpu_rt)  (0xf3860000 | 0x8080 | (vfpu_rt))
/*
+-------------------------------------------------------------+--------------+
|31                                                         7 | 6         0  |
+-------------------------------------------------------------+--------------+
|              opcode 0xf3830000 | size bits                  | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+

  SetMatrixIdentity.Pair/Triple/Quad

    vmidt.p %vfpu_rt	; 2x2 submatrix <- identity	(size bits 0x0080)
    vmidt.t %vfpu_rt	; 3x3 submatrix <- identity	(size bits 0x8000)
    vmidt.q %vfpu_rt	; 4x4 matrix    <- identity	(size bits 0x8080)

	vfpu_rt:	VFPU Matrix Target Register ([p|t|q]reg 0..127);
			OR-ed in unmasked, so keep it within 7 bits.

    vfpu_mtx[%vfpu_rt] <- identity matrix
*/
#define vmidt_p(vfpu_rt)  (0xf3830000 | 0x0080 | (vfpu_rt))
#define vmidt_t(vfpu_rt)  (0xf3830000 | 0x8000 | (vfpu_rt))
#define vmidt_q(vfpu_rt)  (0xf3830000 | 0x8080 | (vfpu_rt))
/*
 * Helpers for direct __asm__ use.
 *
 * cgen_stringify(x) macro-expands x and then turns the result into a
 * string literal (the extra _cgen_stringify level is what forces the
 * expansion before '#' is applied).
 *
 * cgen_asm(x) builds one chunk of assembler source: a ".loc" directive
 * carrying the current C line number, followed by a ".word" directive
 * whose operand is the expanded text of x.  The assembler, not the C
 * compiler, evaluates that expression, so x must expand to something the
 * assembler can parse (integer literals, | & << >>, parentheses).
 * Adjacent cgen_asm() invocations concatenate into one asm template.
 */
#define _cgen_stringify(x)  #x
#define cgen_stringify(x)   _cgen_stringify(x)
#define cgen_asm(x)						\
	".loc 1 " cgen_stringify(__LINE__) " 0\n\t"		\
	".word " cgen_stringify(x) "\n\t"
#endif
main.c:
Code: Select all
#include <pspkernel.h>
#include <pspdebug.h>
#include <pspctrl.h>
#include <pspdisplay.h>
#include "codegen.h"
/* XXX SDK BUG: In theory everything should work when main is running in userspace.
   Unfortunately the PSP hangs if we register the exception handler in the _init constructor, so we need to
   call pspDebugInstallErrorHandler() in main().

   Consequently the user-mode thread attribute is left commented out below and the main thread only
   requests the VFPU attribute.
   NOTE(review): the module attribute 0x1000 presumably selects a kernel-mode module -- confirm against
   the PSPSDK's PSP_MODULE_INFO documentation.
 */
PSP_MAIN_THREAD_ATTR(/*PSP_THREAD_ATTR_USER |*/ PSP_THREAD_ATTR_VFPU);
PSP_MODULE_INFO("VFPU-test", 0x1000, 1, 1);
/* Exit callback (registered by callback_thread): leaves the program via
   sceKernelExitGame().  None of the arguments are used. */
static int exit_callback(int arg1, int arg2, void *common)
{
	(void) arg1;
	(void) arg2;
	(void) common;

	sceKernelExitGame();

	return 0;
}
/* Thread body: creates the "Exit Callback" callback, registers it as the
   exit callback and then sleeps with callback processing enabled.
   The thread arguments are unused. */
static int callback_thread (SceSize args, void *argp)
{
	sceKernelRegisterExitCallback(
		sceKernelCreateCallback("Exit Callback", exit_callback, NULL));

	sceKernelSleepThreadCB();

	return 0;
}
/* Constructor (runs before main): creates the thread that services the exit
   callback and starts it.  If thread creation fails (negative id) nothing is
   started; the failure is not reported. */
static void __attribute__((constructor)) setup_callbacks (void)
{
	int thid;

	thid = sceKernelCreateThread("update_thread", callback_thread,
				     0x11, 0xFA0, THREAD_ATTR_USER, 0);
	if (thid < 0)
		return;

	sceKernelStartThread(thid, 0, 0);
}
/* Destructor (runs at program teardown): hands control back via
   sceKernelExitGame(). */
static void __attribute__((destructor)) back_to_kernel (void)
{
	sceKernelExitGame();
}
/* Error handler installed from main(): re-initialises the debug screen,
   clears it to the background colour below with white text, and prints
   the exception register dump for `regs`. */
static void exception_handler (PspDebugRegBlock *regs)
{
	const unsigned int background = 0x00FF0000;
	const unsigned int foreground = 0xFFFFFFFF;

	pspDebugScreenInit();

	pspDebugScreenSetBackColor(background);
	pspDebugScreenSetTextColor(foreground);
	pspDebugScreenClear();

	pspDebugScreenPrintf("Exception Details:\n");
	pspDebugDumpException(regs);
}
/*
 * Clears the VFPU register file: emits one hand-encoded vmzero.q per 4x4
 * matrix (Q_M000 ... Q_M700, eight in total) as raw .word directives, so
 * the assembler does not need to know the instruction.
 * The asm statement declares no operands and no clobbers.
 */
void vfpu_init (void)
{
        __asm__ volatile (
		cgen_asm(vmzero_q(Q_M000))	/* access register array as matrices for speed: 8 instructions instead of 32 */
		cgen_asm(vmzero_q(Q_M100))
		cgen_asm(vmzero_q(Q_M200))
		cgen_asm(vmzero_q(Q_M300))
		cgen_asm(vmzero_q(Q_M400))
		cgen_asm(vmzero_q(Q_M500))
		cgen_asm(vmzero_q(Q_M600))
		cgen_asm(vmzero_q(Q_M700))
	);
}
/*
 * Dumps VFPU quadword registers 0..31 (the Q_C000..Q_C730 column names in
 * codegen.h) into vfpu_regs, one register per row of the [32][4] array.
 *
 * vfpu_regs:	destination buffer; the callers in this file pass buffers
 *		declared with aligned(64).
 *
 * Each store is a hand-encoded sv.q emitted as a raw .word.  The offset
 * argument i * 4 is in 32-bit words (sv_q shifts it left by two), so
 * register i lands i * 16 bytes past the start of the buffer.  The last
 * argument 0 is the cache_policy field (documented in codegen.h as
 * write-through).
 */
void vfpu_save_regs (float vfpu_regs [32][4])
{
        /* The encoded instructions hard-code R_a0 as base register, so the
           pointer is pinned to a0 explicitly. */
        register void *ptr __asm__ ("a0") = vfpu_regs;
        __asm__ volatile (
		cgen_asm(sv_q(0, 0 * 4, R_a0, 0))
		cgen_asm(sv_q(1, 1 * 4, R_a0, 0))
		cgen_asm(sv_q(2, 2 * 4, R_a0, 0))
                cgen_asm(sv_q(3, 3 * 4, R_a0, 0))
                cgen_asm(sv_q(4, 4 * 4, R_a0, 0))
                cgen_asm(sv_q(5, 5 * 4, R_a0, 0))
                cgen_asm(sv_q(6, 6 * 4, R_a0, 0))
                cgen_asm(sv_q(7, 7 * 4, R_a0, 0))
                cgen_asm(sv_q(8, 8 * 4, R_a0, 0))
                cgen_asm(sv_q(9, 9 * 4, R_a0, 0))
                cgen_asm(sv_q(10, 10 * 4, R_a0, 0))
                cgen_asm(sv_q(11, 11 * 4, R_a0, 0))
                cgen_asm(sv_q(12, 12 * 4, R_a0, 0))
                cgen_asm(sv_q(13, 13 * 4, R_a0, 0))
                cgen_asm(sv_q(14, 14 * 4, R_a0, 0))
                cgen_asm(sv_q(15, 15 * 4, R_a0, 0))
                cgen_asm(sv_q(16, 16 * 4, R_a0, 0))
                cgen_asm(sv_q(17, 17 * 4, R_a0, 0))
                cgen_asm(sv_q(18, 18 * 4, R_a0, 0))
                cgen_asm(sv_q(19, 19 * 4, R_a0, 0))
                cgen_asm(sv_q(20, 20 * 4, R_a0, 0))
                cgen_asm(sv_q(21, 21 * 4, R_a0, 0))
                cgen_asm(sv_q(22, 22 * 4, R_a0, 0))
                cgen_asm(sv_q(23, 23 * 4, R_a0, 0))
                cgen_asm(sv_q(24, 24 * 4, R_a0, 0))
                cgen_asm(sv_q(25, 25 * 4, R_a0, 0))
                cgen_asm(sv_q(26, 26 * 4, R_a0, 0))
                cgen_asm(sv_q(27, 27 * 4, R_a0, 0))
                cgen_asm(sv_q(28, 28 * 4, R_a0, 0))
                cgen_asm(sv_q(29, 29 * 4, R_a0, 0))
                cgen_asm(sv_q(30, 30 * 4, R_a0, 0))
                cgen_asm(sv_q(31, 31 * 4, R_a0, 0))
		/* ptr is listed as input so a0 is loaded before the stores; the
		   "memory" clobber tells the compiler the buffer is written.
		   NOTE(review): ptr is also listed as an output although the
		   instructions do not modify a0 -- presumably harmless, confirm. */
		: "=r"(ptr) : "r"(ptr) : "memory");
}
/* Returns non-zero when the two 4-component rows differ in any component. */
static int vfpu_row_differs (const float *a, const float *b)
{
	int k = 0;

	while (k < 4 && a[k] == b[k])
		k++;

	return k < 4;
}

/*
 * Prints a diff-style report of two register snapshots to the debug screen.
 *
 * r1:	snapshot taken before the test ("-" lines)
 * r2:	snapshot taken after the test ("+" lines)
 *
 * First every changed row is printed with its old contents, then every
 * changed row again with its new contents.  Rows that are equal in all four
 * components are skipped.
 */
void vfpu_diff (float r1 [32][4], float r2 [32][4])
{
	int row;

	/* pass 1: old values of the rows that changed */
	for (row = 0; row < 32; row++) {
		if (!vfpu_row_differs(r1[row], r2[row]))
			continue;
		pspDebugScreenPrintf("- %i: % 5.5f % 5.5f % 5.5f % 5.5f\n",
				     row, r1[row][0], r1[row][1], r1[row][2], r1[row][3]);
	}

	/* pass 2: new values of the same rows */
	for (row = 0; row < 32; row++) {
		if (!vfpu_row_differs(r1[row], r2[row]))
			continue;
		pspDebugScreenPrintf("+ %i: % 5.5f % 5.5f % 5.5f % 5.5f\n",
				     row, r2[row][0], r2[row][1], r2[row][2], r2[row][3]);
	}
}
/* Register snapshots: main() fills vfpu_regs0 before and vfpu_regs1 after
   running vfpu_testcase(), then passes both to vfpu_diff().  One row of four
   floats per quadword register; aligned for the sv.q stores in
   vfpu_save_regs(). */
static float vfpu_regs0 [32][4] __attribute__((aligned(64)));
static float vfpu_regs1 [32][4] __attribute__((aligned(64)));
/**
 *  Playground: put the instruction(s) whose behaviour you want to observe
 *  here.  main() snapshots the VFPU registers before and after this call and
 *  prints the difference.
 *
 *  As shipped it loads the identity matrix into matrix 1 (vmidt.q M100).
 */
void vfpu_testcase (void)
{
	/* An asm without output operands is volatile anyway; the qualifier
	   is spelled out to match the other asm blocks in this file. */
	__asm__ volatile (
		cgen_asm(vmidt_q(Q_M100))
	);
}
/*
 * Entry point: installs the exception handler, configures the controller,
 * prints the banner, zeroes the VFPU registers and then polls the pad once
 * per vblank:
 *   O  -> snapshot registers, run vfpu_testcase(), snapshot again, print diff
 *   X  -> execute a break instruction to exercise the exception handler
 * The loop never terminates; argc/argv are unused.
 */
int main (int argc, char **argv)
{
	SceCtrlData pad;

	/* Installed here rather than in a constructor, see the SDK note at
	   the top of the file. */
	pspDebugInstallErrorHandler(exception_handler);

	sceCtrlSetSamplingCycle(0);
	sceCtrlSetSamplingMode(PSP_CTRL_MODE_DIGITAL);

	pspDebugScreenInit();
	pspDebugScreenPrintf("VFPU test  --  vfpu_regs0 = %p, vfpu_regs1 = %p\n\n", vfpu_regs0, vfpu_regs1);
	pspDebugScreenPrintf("press O to run VFPU testcase or X to trap into breakpoint\n\n");

	vfpu_init();

	for (;;) {
		sceCtrlReadBufferPositive(&pad, 1);

		if (pad.Buttons & PSP_CTRL_CIRCLE) {
			/* before / after snapshots around the test case */
			vfpu_save_regs(vfpu_regs0);
			vfpu_testcase();
			vfpu_save_regs(vfpu_regs1);
			vfpu_diff(vfpu_regs0, vfpu_regs1);
		}

		if (pad.Buttons & PSP_CTRL_CROSS) {
			/* Cause a break exception, to check that the exception handler works... */
			asm("break\n");
		}

		sceDisplayWaitVblankStart();
	}

	return 0;
}

