gbadev.org forum archive

This is a read-only mirror of the content originally found on forum.gbadev.org (now offline), salvaged from Wayback machine copies. A new forum can be found here.

ASM > Division in THUMB mode?

#69613 - Janekxx - Tue Jan 31, 2006 8:28 pm

Hi, maybe some of You have fast division routine written THUMB mode, or maybe some C optimized one?

Please help me if You can.

Currenty I'm using this, but it's slooow:
Code:
#define divnorm(num, den, sign)                 \
{                                               \
  if (num < 0)                                  \
    {                                           \
      num = -num;                               \
      sign = 1;                                 \
    }                                           \
  else                                          \
    {                                           \
      sign = 0;                                 \
    }                                           \
                                                \
  if (den < 0)                                  \
    {                                           \
      den = - den;                              \
      sign = 1 - sign;                          \
    }                                           \
}


static unsigned long
divmodsi4(int modwanted, unsigned long num, unsigned long den) 
{                                               
  long int bit = 1;                             
  long int res = 0;                             
  long prevden;
  while (den < num && bit && !(den & (1L<<31)))                       
    {
      den <<=1;                                 
      bit <<=1;                                 
    }                                           
  while (bit)
    {                                   
      if (num >= den)
        {                               
          num -= den;                           
          res |= bit;                           
        }                                               
      bit >>=1;                                 
      den >>=1;                                 
    }                                           
  if (modwanted) return num;
  return res;
}


#define exitdiv(sign, res) if (sign) { res = - res;} return res;

long
__modsi3 (long numerator, long denominator)
{
  int sign = 0;
  long dividend;
  long modul;


  if (numerator < 0)
    {
      numerator = -numerator;
      sign = 1;
    }
  if (denominator < 0)
    {
      denominator = -denominator;
    } 
 
  modul =  divmodsi4 (1, numerator, denominator);
  if (sign)
    return - modul;
  return modul;
}


long
__divsi3 (long numerator, long denominator)
{
  int sign;
  long dividend;
  long modul;
  divnorm (numerator, denominator, sign);

  dividend = divmodsi4 (0,  numerator, denominator);
  exitdiv (sign, dividend);
}

long
__umodsi3 (unsigned long numerator, unsigned long denominator)
{
  long dividend;
  long modul;

modul= divmodsi4 (1,  numerator, denominator);
  return modul;
}

long
__udivsi3 (unsigned long numerator, unsigned long denominator)
{
  int sign;
  long dividend;
  long modul;
  dividend =   divmodsi4 (0, numerator, denominator);
  return dividend;
}

_________________
sony.int.pl

#69615 - DekuTree64 - Tue Jan 31, 2006 8:48 pm

You'll be much better off switching to ARM mode than dividing in THUMB. Can be done of course, but will be slow as dirt due to all the conditionals involved. In ARM, a lot of those become single conditional instructions, rather than conditional jumps over instructions.

Check out the 32/32=32 divide on this page. You'll have to add colons after all the labels to get it to assemble with gcc, but it's fast (for a division routine, anyway).

Or do you have a specific need to use THUMB?
_________________
___________
The best optimization is to do nothing at all.
Therefore a fully optimized program doesn't exist.
-Deku

#69649 - Janekxx - Tue Jan 31, 2006 11:01 pm

Thanks for reply, but I know that page :)

It's very important to me that routine will be in THUMB mode because of memory limitations (not GBA platform - old Sony Jxx phones).
_________________
sony.int.pl

#69653 - DekuTree64 - Tue Jan 31, 2006 11:27 pm

Ah, ok. Your C routine looks pretty good then. Here are a couple of micro-optimizations that may or may not help:

Code:
static unsigned long
divmodsi4(int modwanted, unsigned long num, unsigned long den) 
{                                               
  long int bit = 1;                             
  long int res = 0;                             
  long prevden;
     // Removed the && bit from this loop, since den & bit31
     // will happen at least just as soon (unless den is 0, which
     // is bad anyway, and could be checked for)
  while (!(den & (1L<<31)) && den < num)                       
    {
      bit <<=1;  // \_ Switched order of these. Maybe compiler
      den <<=1;  // /  will use bpl/bmi to check bit31 of den
    }                                           
  while (bit)
    {                                   
      if (num >= den)
        {                               
          num -= den;                           
          res |= bit;                           
        }                                               
      den >>=1;   // \_ Switched the order of these. If your compiler
      bit >>=1;   // /  is smart enough, it will use the flags from
    }             //    this shift to do the loop branch
  if (modwanted) return num;
  return res;
}


Maybe try checking the assembly output, and if it doesn't use those loop condition tricks I pointed out, then keep the assembly and make the changes yourself :)
_________________
___________
The best optimization is to do nothing at all.
Therefore a fully optimized program doesn't exist.
-Deku

#69658 - poslundc - Tue Jan 31, 2006 11:57 pm

How serious are your memory limitations? Some alternatives you might want to consider:

- ARM code might be faster and consume less memory for something like this, even if you have a 16-bit bus. (It'd take more knowledge of the target to say for sure.)

- Use a reciprocal LUT and keep the resolution relatively low if memory is at a real premium... use linear interpolation to get in-between values; it's not terribly accurate but might be good enough for your application. Might not be good with your memory constraints, but it's hard to beat for speed.

Dan.

#69797 - Janekxx - Wed Feb 01, 2006 11:37 pm

Thanks for C optimalizations I will use it.

I'm forced to use thumb: libs, etc. , everything for Sony Jxx is made in thumb, I try to force arm mode but always same result - phone reset.


I'm working on 3D engine, (curentlly ~17 fps with 200polygons object :) (slow drawing on phone))
_________________
sony.int.pl

#69888 - Janekxx - Thu Feb 02, 2006 8:06 pm

By sugestions of precalculated table I made folowing function for signed division, maybe will be usefull to someone:
Code:
int  div_table[]={65536,32768,21845,16384,13107,10923,9362,8192,7282, // pecalculated values table (1/x * 65536)
                  6554,5958,5461,5041,4681,4369,4096,3855,3641,3449,
                  3277,3121,2979,2849,2731,2621,2521,2427,2341,2260,
                  2185,2114,2048,1986,1928,1872,1820,1771,1725,1680,
                  1634,1598,1560,1524,1489,1456,1425,1394,1365,1337,
                  1311,1285,1260,1237,1214,1192,1170,1150,1130,1111,
                  1092,1074,1057,1040,1024,1008,993,978,964,950,
                  936,923,910,898,886,874,862,851,840,830,
                  819,809,799,790,780,771,762,753,745,736,
                  728,720,712,705,697,690,683};

int super_div(int a, int b){    // fast division from 1 to 96
    a *=div_table[b];
    return a>>16;
}

_________________
sony.int.pl

#80460 - keldon - Sat Apr 22, 2006 1:46 pm

There is another clever way used by Jeff for fast software divides and multiplies.


a / b == antiLog(log(a) - log(b));

So if you create a log and antilog table you get a fast multiply / divide.

#80487 - poslundc - Sat Apr 22, 2006 6:56 pm

I'd thought of using this identity for speedy division before, but it just seemed to be requiring twice the tables and three times the lookups in order to get what would probably be diminished accuracy than a simple reciprocal LUT. Is there a compelling reason to use the former instead of the latter?

Dan.

#80566 - keldon - Sun Apr 23, 2006 2:39 pm

To be honest I just found it interesting that it worked for divides too, and no it isn't really useful for the gba because you have the multiplies. It's only useful [i guess] when you don't have hardware multiplies.

#80815 - Miked0801 - Tue Apr 25, 2006 7:22 pm

Log/AntiLog stuff looks great on paper, but I've found the precision really suffers compared to other methods. It also requires 2 lookups instead of 1. It does allow for fast non-integer Pow(x) type functions though (When multiplying logs by scalers, it is the same as take the log number to the power of the scaler and dividing is the same as sqrt)