Better I/O performances with a different BUFSIZ

Discuss the development of new homebrew software, tools and libraries.

Moderators: cheriff, TyRaNiD

Post Reply
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Better I/O performances with a different BUFSIZ

Post by Beuc »

Hi,

I saw that Memory Stick access is pretty inefficient when using fread(3).

From what I can see, this is because BUFSIZ is set to 1024 in the psptoolchain; with a larger value, throughput could be improved by up to 8x.

How about using a BUFSIZ value like 65k to (dramatically) improve Memory Stick perfs?


I did the following tests with read(2):

Code: Select all

test_bufsiz=     512 - read(2): 31493ms (22048501B, 700.11B/s)
test_bufsiz=    1024 - read(2): 16443ms (22048501B, 1340.91B/s)
test_bufsiz=    2048 - read(2):  8964ms (22048501B, 2459.67B/s)
test_bufsiz=    4096 - read(2):  5457ms (22048501B, 4040.41B/s)
test_bufsiz=    8192 - read(2):  3703ms (22048501B, 5954.23B/s)
test_bufsiz=   16384 - read(2):  2842ms (22048501B, 7758.09B/s)
test_bufsiz=   32768 - read(2):  2391ms (22048501B, 9221.46B/s)
test_bufsiz=   65536 - read(2):  2162ms (22048501B, 10198.20B/s)
test_bufsiz=  131072 - read(2):  2126ms (22048501B, 10370.88B/s)
test_bufsiz=  262144 - read(2):  2113ms (22048501B, 10434.69B/s)
test_bufsiz=  524288 - read(2):  2104ms (22048501B, 10479.33B/s)
test_bufsiz= 1048576 - read(2):  2099ms (22048501B, 10504.29B/s)
test_bufsiz= 2097152 - read(2):  2097ms (22048501B, 10514.31B/s)
test_bufsiz= 4194304 - read(2):  2096ms (22048501B, 10519.32B/s)
test_bufsiz= 8388608 - read(2):  2094ms (22048501B, 10529.37B/s)
test_bufsiz=16777216 - read(2):  2095ms (22048501B, 10524.34B/s)
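(Note: the last column is labelled "B/s", but the program computes it as bytes divided by milliseconds, so it is really bytes per millisecond, i.e. roughly kB/s. The best results are therefore about 10 MB/s.)

Source code: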

Code: Select all

#include <stdio.h>
#include <stdlib.h>

#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <SDL/SDL.h>

#include <pspmoduleinfo.h>
PSP_HEAP_SIZE_MAX&#40;&#41;;

int main&#40;&#41;
&#123; 
  SDL_Init&#40;SDL_INIT_EVERYTHING&#41;;
  Uint32 start;
  printf&#40;"I/O perf test\n"&#41;;

  int i;
  for &#40;i = 9; i <= 24; i++&#41;
    &#123;
      int test_bufsiz = 1<<i;
      char* buf = malloc&#40;test_bufsiz&#41;;
      if &#40;buf == NULL&#41; printf&#40;"out of memory\n"&#41;, exit&#40;0&#41;;
      printf&#40;"test_bufsiz=%8d -", test_bufsiz&#41;;

      start = SDL_GetTicks&#40;&#41;;
      int nb_read = 0;
      int total = 0;
      int din = open&#40;"ms0&#58;/340.PBP", 0&#41;;
      while &#40;&#40;nb_read = read&#40;din, buf, test_bufsiz&#41;&#41; > 0&#41; total += nb_read;
      int t = SDL_GetTicks&#40;&#41; - start;
      printf&#40;" read&#40;2&#41;&#58; %5dms &#40;%8dB, %.2fB/s&#41;", t, total, 1.0*total/t&#41;;
      close&#40;din&#41;;

      printf&#40;"\n"&#41;;
      free&#40;buf&#41;;
    &#125;

  SDL_Quit&#40;&#41;;
  return 0;
&#125;
We can compare with fread, which internally uses 1024:

Code: Select all

test_bufsiz=     512 - fread(3): 28748ms (22048256B, 766.95B/s)
test_bufsiz=    1024 - fread(3): 28704ms (22047744B, 768.11B/s)
test_bufsiz=    2048 - fread(3): 28348ms (22046720B, 777.72B/s)
test_bufsiz=    4096 - fread(3): 28681ms (22044672B, 768.62B/s)
test_bufsiz=    8192 - fread(3): 28712ms (22044672B, 767.79B/s)
test_bufsiz=   16384 - fread(3): 28376ms (22036480B, 776.59B/s)
test_bufsiz=   32768 - fread(3): 28742ms (22020096B, 766.13B/s)
test_bufsiz=   65536 - fread(3): 28725ms (22020096B, 766.58B/s)
test_bufsiz=  131072 - fread(3): 28379ms (22020096B, 775.93B/s)
test_bufsiz=  262144 - fread(3): 28741ms (22020096B, 766.16B/s)
test_bufsiz=  524288 - fread(3): 28722ms (22020096B, 766.66B/s)
test_bufsiz= 1048576 - fread(3): 28372ms (22020096B, 776.12B/s)
test_bufsiz= 2097152 - fread(3): 28735ms (20971520B, 729.82B/s)
test_bufsiz= 4194304 - fread(3): 28726ms (20971520B, 730.05B/s)
test_bufsiz= 8388608 - fread(3): 28367ms (16777216B, 591.43B/s)
test_bufsiz=16777216 - fread(3): 28725ms (16777216B, 584.06B/s)
(The byte counts are not accurate because fread(3) is called here with a single item of test_bufsiz bytes, so it only reports "I got it all" or "I didn't get it all", without details on how much data was read in a partial read. The timing data remains right.)

Source code:

Code: Select all

#include <stdio.h>
#include <stdlib.h>

#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <SDL/SDL.h>

#include <pspmoduleinfo.h>
PSP_HEAP_SIZE_MAX&#40;&#41;;

int main&#40;&#41;
&#123; 
  SDL_Init&#40;SDL_INIT_EVERYTHING&#41;;
  Uint32 start;
  printf&#40;"I/O perf test\n"&#41;;

  int i;
  for &#40;i = 9; i <= 24; i++&#41;
    &#123;
      int test_bufsiz = 1<<i;
      char* buf = malloc&#40;test_bufsiz&#41;;
      if &#40;buf == NULL&#41; printf&#40;"out of memory\n"&#41;, exit&#40;0&#41;;
      printf&#40;"test_bufsiz=%8d -", test_bufsiz&#41;;

      start = SDL_GetTicks&#40;&#41;;
      int total2 = 0;
      FILE* fin = fopen&#40;"ms0&#58;/340.PBP", "rb"&#41;;
      while &#40;fread&#40;buf, test_bufsiz, 1, fin&#41;&#41; total2 += test_bufsiz;
      int t = SDL_GetTicks&#40;&#41; - start;
      printf&#40;" fread&#40;3&#41;&#58; %5dms &#40;%8dB, %.2fB/s&#41;", t, total2, 1.0*total2/t&#41;;
      fclose&#40;fin&#41;;

      printf&#40;"\n"&#41;;
      free&#40;buf&#41;;
    &#125;

  SDL_Quit&#40;&#41;;
  return 0;
&#125;
jimparis
Posts: 1145
Joined: Fri Jun 10, 2005 4:21 am
Location: Boston

Post by jimparis »

sure, if you could write a patch I'll apply it.
TyRaNiD
Posts: 907
Joined: Sun Jan 18, 2004 12:23 am

Post by TyRaNiD »

Urm be careful though (I haven't checked where the buffer is defined) but there is always the risk that increasing the size could make existing code fail due to higher memory requirements (well when recompiled against a new libc).

If it is a static buffer (I guess likely) then you could blow away almost an extra meg of ram if someone opened as many MS files as they can. If it is stack based there is the risk of stack overflow which are really bastards to track down on the PSP.

As I have always tried to point out, libc is there more for glue code to make porting easier than something which you should rely on in a memory limited embedded system.
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Post by Beuc »

jimparis wrote:sure, if you could write a patch I'll apply it.
I'll give this a try!
TyRaNiD wrote:Urm be careful though (I haven't checked where the buffer is defined) but there is always the risk that increasing the size could make existing code fail due to higher memory requirements (well when recompiled against a new libc).

If it is a static buffer (I guess likely) then you could blow away almost an extra meg of ram if someone opened as many MS files as they can. If it is stack based there is the risk of stack overflow which are really bastards to track down on the PSP.

As I have always tried to point out, libc is there more for glue code to make porting easier than something which you should rely on in a memory limited embedded system.
Yes, too big a buffer sounds like a bad idea. Plus, BUFSIZ is only 8192 bytes on GNU/Linux (though that's an efficient value there) - better to stay within the same order of magnitude.

Still, fread(3) is used by lots of games, if only through SDL (and SDL_rwops.c), so it's worth the trouble :)
J.F.
Posts: 2906
Joined: Sun Feb 22, 2004 11:41 am

Post by J.F. »

I'd suggest something like the PSP_HEAP_SIZE_XXX defines... make a weak variable called something like "_fread_bufsize" that defaults to 1024 when not defined, but otherwise takes the defined value. That way, someone who's aware of the memory constraint and wants the fastest speed can add "PSP_FREAD_BUFSIZE(65536);" to the application to set the buffer size explicitly.

Look at the sbrk code in newlib to see how the heap size is handled.
coolkehon
Posts: 355
Joined: Mon Oct 20, 2008 5:44 am

Post by coolkehon »

this sound like a good idea
note: just posting so that i will get notified by email because i dont know how to watch this topic =)
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Post by Beuc »

J.F. wrote:I'd suggest something like the PSP_HEAP_SIZE_XXX defines... make a weak variable called something like "_fread_bufsize" that defaults to 1024 when not defined, but otherwise takes the defined value. That way, someone who's aware of the memory constraint and wants the fastest speed can add "PSP_FREAD_BUFSIZE(65536);" to the application to set the buffer size explicitly.

Look at the sbrk code in newlib to see how the heap size is handled.
Well, BUFSIZ is used in a lot of statically allocated variables, including outside of newlib, so I think it needs to stay constant.

Maybe we can use this technique to change the FILE* internal buffer size though.
(apparently this happens in newlib/libc/stdio/makebuf.c:109)
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Post by Beuc »

Beuc wrote:Maybe we can use this technique to change the FILE* internal buffer size though.
(apparently this happens in newlib/libc/stdio/makebuf.c:109)
I tried the following patch:

Code: Select all

--- newlib-1.16.0.bak/newlib/libc/stdio/makebuf.c	2007-05-02 01:03:36.000000000 +0200
+++ newlib-1.16.0-psp/newlib/libc/stdio/makebuf.c	2009-07-06 23:56:19.000000000 +0200
@@ -74,8 +74,12 @@
 #ifdef HAVE_BLKSIZE
       size = st.st_blksize <= 0 ? BUFSIZ : st.st_blksize;
 #else
+#if defined(__psp__)
+      size = 65536;
+#else
       size = BUFSIZ;
 #endif
+#endif
       /*
        * Optimize fseek() only if it is a regular file.
        * (The test for __sseek is mainly paranoia.)
Here's the results of the fread(3) loop with different values:

Code: Select all

newlib size =    1024 - fread(3): 26929ms (current)
newlib size =    4096 - fread(3): 11260ms
newlib size =   32768 - fread(3):  3467ms
newlib size =   65536 - fread(3):  2885ms
newlib size =  131072 - fread(3):  2595ms
newlib size = 1048576 - fread(3):  2367ms
It doesn't matter what test_bufsiz you use, the perfs are the same with +/- 50ms.
(btw perfs are slightly better than in my first post because I rebooted from fw 1.50 to 5.00 during the afternoon - 65k perfs for fw 1.50 are around 2925ms).

This looks promising :)

Also:
TyRaNiD wrote:If it is a static buffer (I guess likely) then you could blow away almost an extra meg of ram if someone opened as many MS files as they can. If it is stack based there is the risk of stack overflow which are really bastards to track down on the PSP.
AFAICS it's a malloc:

Code: Select all

  if ((p = _malloc_r (ptr, size)) == NULL)
I also see that the PSP crashes when more than 10 files are open at once, so the increased memory usage is at most (65k-1k)*10 = 640k.


We need to perform some _write_ test now.
If write performance is not harmed, I think this patch is enough. BUFSIZ is not changed, and still does not represent an efficient buffer size, but there may be fewer issues with only altering 'size'.

What do you think?
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Post by Beuc »

Hmmm, BAD news.

I tried recompiling the game port I'm working on (GNU FreeDink), and apparently this patch dramatically _harms_ the loading time.

I suppose that when accessing lots of small files or when doing a decent number of fseek()s, reading big chunks (with a 65k buffer) harms performance (like 4x slower) more than reading small chunks (with a 1k buffer).

Ah well..
J.F.
Posts: 2906
Joined: Sun Feb 22, 2004 11:41 am

Post by J.F. »

Beuc wrote:Hmmm, BAD news.

I tried recompiling the game port I'm working on (GNU FreeDink), and apparently this patch dramatically _harms_ the loading time.

I suppose that when accessing lots of small files or when doing a decent number of fseek()s, reading big chunks (with a 65k buffer) harms performance (like 4x slower) more than reading small chunks (with a 1k buffer).

Ah well..
I suppose fread needs to be made smarter - only reading what is asked for on the first read, but reading larger amounts if the next (one or two or more) freads are all sequential. That way sequential reads are sped up, while small random reads aren't slowed by reading large amounts that won't be used.
TyRaNiD
Posts: 907
Joined: Sun Jan 18, 2004 12:23 am

Post by TyRaNiD »

640K is still quite a sizable increase :)
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Post by Beuc »

AFAICS access time for small files doesn't change; this only impacts small reads in larger files:

Code: Select all

test_bufsiz=    1024 - read(2):   401ms - lseek(2):   467ms
test_bufsiz=    2048 - read(2):   397ms - lseek(2):   444ms
test_bufsiz=    4096 - read(2):   397ms - lseek(2):   803ms
test_bufsiz=    8192 - read(2):   398ms - lseek(2):   766ms
test_bufsiz=   16384 - read(2):   397ms - lseek(2):   846ms
test_bufsiz=   32768 - read(2):   398ms - lseek(2):   995ms
test_bufsiz=   65536 - read(2):   397ms - lseek(2):  1425ms
test_bufsiz=  131072 - read(2):   399ms - lseek(2):  2069ms
test_bufsiz=  262144 - read(2):   398ms - lseek(2):  3320ms
test_bufsiz=  524288 - read(2):   397ms - lseek(2):  5827ms
test_bufsiz= 1048576 - read(2):   397ms - lseek(2): 10873ms
Source code:

Code: Select all

#include <stdio.h>
#include <stdlib.h>

#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <SDL/SDL.h>

#include <pspmoduleinfo.h>
PSP_HEAP_SIZE_MAX&#40;&#41;;

int main&#40;&#41;
&#123; 
  SDL_Init&#40;SDL_INIT_EVERYTHING&#41;;
  Uint32 start;
  printf&#40;"I/O perf test\n"&#41;;

  int i;
  for &#40;i = 10; i <= 20; i++&#41;
    &#123;
      int n;
      int test_bufsiz = 1<<i;
      char* buf = malloc&#40;test_bufsiz&#41;;
      if &#40;buf == NULL&#41; printf&#40;"out of memory\n"&#41;, exit&#40;0&#41;;
      printf&#40;"test_bufsiz=%8d", test_bufsiz&#41;;

      start = SDL_GetTicks&#40;&#41;;
      for &#40;n = 0; n < 100; n++&#41;
	&#123;
	  int nb_read = 0;
	  int total = 0;
	  int din = open&#40;"ms0&#58;/batser.bin", 0&#41;;
	  if &#40;din < 0&#41; perror&#40;"open"&#41;, exit&#40;0&#41;;
	  while &#40;&#40;nb_read = read&#40;din, buf, test_bufsiz&#41;&#41; > 0&#41; total += nb_read;
	  close&#40;din&#41;;
	&#125;
      printf&#40;" - read&#40;2&#41;&#58; %5dms", SDL_GetTicks&#40;&#41; - start&#41;;

      start = SDL_GetTicks&#40;&#41;;
      int din = open&#40;"ms0&#58;/500.PBP", 0&#41;;
      if &#40;din < 0&#41; perror&#40;"open"&#41;, exit&#40;0&#41;;
      for &#40;n = 0; n < 100; n++&#41;
	&#123;
	  int nb_read = 0;
	  int total = 0;
	  if &#40;lseek&#40;din, 3500000, SEEK_SET&#41; < 0&#41;
	    perror&#40;"lseek"&#41;, exit&#40;0&#41;;
	  do &#123;
	    nb_read = read&#40;din, buf, test_bufsiz&#41;;
	    total += nb_read;
	  &#125; while &#40;total < 5000&#41;;
	&#125;
      close&#40;din&#41;;
      printf&#40;" - lseek&#40;2&#41;&#58; %5dms", SDL_GetTicks&#40;&#41; - start&#41;;

      printf&#40;"\n"&#41;;
      free&#40;buf&#41;;
    &#125;

  SDL_Quit&#40;&#41;;
  return 0;
&#125;
User avatar
Torch
Posts: 825
Joined: Wed May 28, 2008 2:50 am

Post by Torch »

How do the sceIO* functions handle things? Does it have another buffer or does it DMA straight into your pointer?
Beuc
Posts: 33
Joined: Thu Mar 26, 2009 5:04 am
Location: holland

Post by Beuc »

Hi!
Torch wrote:How do the sceIO* functions handle things? Does it have another buffer or does it DMA straight into your pointer?
AFAICS the only software buffer is in fread(3). fread(3) itself calls read(2) which calls sceIoRead, with some reentrant (*_r) steps in-between.

The tests suggest that the PSP kernel doesn't do any caching for MS access.
Post Reply