Skip to content

Commit 4b09364

Browse files
committed
bug fixes, optimizing, WIN32_API_DEBUG GCC, alloca/memcpy replaced MSVC 32b
-changed profiling timing code to be cumulative and added CPU locking -added a bypass of looking inside cv on ->Call() by making cv a tagged ptr -an optimized MASM inline assembly replaced alloca and memcpy on x86-32 -returning a MI64 obj became branchless between signed and unsigned MI64s -in commit "further optimizations in incoming arg loop, TARG usage, no memcpy" a loop overflow was introduced, depending on uninitialized memory, this could lead to a junk SV * on Perl stack being accessed -in commit "further optimizations in incoming arg loop, TARG usage, no memcpy", a bug that made native unsigned I64s signed scalars, has been fixed -in call_i686.h, a bug with T_INTEGER not being in a switch under WIN32_API_DEBUG was fixed, originally T_INTEGER was changed at Call() runtime to another T_ code in Call(), but that changing of T_ code was removed as an optimization -some compiler warnings silenced -random optimizing -see Changes
1 parent 41712f8 commit 4b09364

File tree

11 files changed

+353
-161
lines changed

11 files changed

+353
-161
lines changed

API.h

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,15 @@
3838
#define DISABLE_T_L_CALLS STMT_START { 0; } STMT_END
3939
#endif
4040

41-
/*never use this on cygwin, the debug messages in Call_asm leave the printf
42-
string args on the c stack and the target C func sees printf args, I (bulk88)
43-
did not research what GCC compiler flags or pragmas or declaration attrs are
44-
necessery to make WIN32_API_DEBUG work on Cygwin GCC
41+
#ifdef __GNUC__
42+
# define PORTALIGN(x) __attribute__((aligned(x)))
43+
#elif defined(_MSC_VER)
44+
# define PORTALIGN(x) __declspec(align(x))
45+
#else
46+
# error unknown compiler
47+
#endif
4548

49+
/*
4650
when using WIN32_API_DEBUG change the iterations count to 1 in benchmark.t
4751
otherwise the test takes eternity
4852
*/
@@ -77,6 +81,12 @@ LARGE_INTEGER Call_asm_b4 = {0};
7781
LARGE_INTEGER Call_asm_after = {0};
7882
LARGE_INTEGER return_time = {0};
7983
LARGE_INTEGER return_time2 = {0};
84+
85+
LARGE_INTEGER start_loopprep = {0};
86+
LARGE_INTEGER loopprep_loopstart = {0};
87+
LARGE_INTEGER loopstart_Call_asm_b4 = {0};
88+
LARGE_INTEGER Call_asm_b4_Call_asm_after = {0};
89+
LARGE_INTEGER Call_asm_after_return_time = {0};
8090
# ifndef WIN64
8191
__declspec( naked ) unsigned __int64 rdtsc () {
8292
__asm
@@ -188,6 +198,49 @@ typedef struct {
188198
SV* object;
189199
} APICALLBACK;
190200

201+
/* bitfield is 4 bytes, low to high diagram\|/
202+
char flags, short stackunwind, char outType
203+
note the stackunwind is unaligned
204+
*/
205+
206+
#define CTRL_IS_MORE 0x10
207+
#define CTRL_HAS_PROTO 0x20
208+
typedef struct {
209+
union {
210+
struct {
211+
unsigned int convention: 3;
212+
unsigned int UseMI64: 1;
213+
unsigned int is_more: 1;
214+
unsigned int has_proto: 1;
215+
#ifndef _WIN64
216+
unsigned int reserved: 2;
217+
/* remember to change Call_asm in API::Call() if this is changed */
218+
unsigned int stackunwind: 16;
219+
#else
220+
unsigned int reserved: 18;
221+
#endif
222+
unsigned int out: 8;
223+
};
224+
U32 whole_bf;
225+
};
226+
U32 inparamlen; /*in units of sizeof(SV *) for comparison to items_sv
227+
param count limited to 65K in API.pm so 32 bit lengths
228+
dont overflow*/
229+
FARPROC ApiFunction;
230+
SV * api; /* a non-ref counted weak RV to the blessed SVPV that holds
231+
APICONTROL, used to optimize method calls on the API obj, the
232+
refcount for the RV is stored in the obj's hidden hash*/
233+
/* this AV is here for no func call look up of it, intypes may be NULL,
234+
refcnt owned by obj's hidden hash*/
235+
AV * intypes;
236+
/* a padding hole here of unknown size */
237+
PORTALIGN(16) APIPARAM param;
238+
} APICONTROL;
239+
240+
#define APICONTROL_CC_STD 0
241+
#define APICONTROL_CC_C 1
242+
/* fastcall, thiscall, regcall, will go here */
243+
191244
#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2*!!!(expr)]))
192245
/*
193246
because of unknown alignment where the sentinel is placed after the PV
@@ -291,7 +344,7 @@ S_croak_xs_usage(pTHX_ const CV *const cv, const char *const params)
291344
(PERL_VERSION == (V) && (PERL_SUBVERSION <= (S))))))
292345

293346
#if PERL_VERSION_LE(5, 13, 8)
294-
MAGIC * my_find_mg(SV * sv, int type, const MGVTBL *vtbl){
347+
STATIC MAGIC * my_find_mg(SV * sv, int type, const MGVTBL *vtbl){
295348
MAGIC *mg;
296349
for (mg = SvMAGIC (sv); mg; mg = mg->mg_moremagic) {
297350
if (mg->mg_type == type && mg->mg_virtual == vtbl)
@@ -303,7 +356,7 @@ MAGIC * my_find_mg(SV * sv, int type, const MGVTBL *vtbl){
303356
#endif
304357

305358
#if PERL_VERSION_LE(5, 7, 2)
306-
MAGIC *
359+
STATIC MAGIC *
307360
my_sv_magicext(pTHX_ SV* sv, SV* obj, int how, MGVTBL *vtable,
308361
const char* name, I32 namlen)
309362
{

API.pm

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,7 @@ sub new {
219219
$self->{weakapi} = \$control;
220220
weaken($self->{weakapi});
221221
$control = pack( 'L'
222-
.'S'
223-
.'S' #padding
222+
.'L'
224223
.(PTRSIZE == 8 ? 'Q' : 'L')
225224
.(PTRSIZE == 8 ? 'Q' : 'L')
226225
.(PTRSIZE == 8 ? 'Q' : 'L')
@@ -230,8 +229,7 @@ sub new {
230229
| $ccnum
231230
| (PTRSIZE == 8 ? 0 : $stackunwind << 8)
232231
| $outnum << 24
233-
, scalar(@{$self->{in}}) #in param count
234-
, 0 #padding
232+
, scalar(@{$self->{in}}) * PTRSIZE #in param count, in SV * units
235233
, $hproc
236234
, \($self->{weakapi})+0 #weak api obj ref
237235
, (exists $self->{intypes} ? ($self->{intypes})+0 : 0)
@@ -984,10 +982,10 @@ be treated as a Math::Int64 object without having to previously call
984982
L</UseMI64>.
985983
986984
=item C<F>:
987-
value is a floating point number (float)
985+
value is a single precision (4 bytes) floating point number (float)
988986
989987
=item C<D>:
990-
value is a double precision number (double)
988+
value is a double precision (8 bytes) floating point number (double)
991989
992990
=item C<S>:
993991
value is a unsigned short (unsigned short)

API.xs

Lines changed: 40 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#include "perl.h"
1818
#include "XSUB.h"
1919
#define CROAK croak
20-
#include <emmintrin.h>
20+
//#include <emmintrin.h>
2121
#include "API.h"
2222

2323
/*
@@ -75,7 +75,16 @@ BOOL WINAPI _DllMainCRTStartup(
7575
}
7676
#endif
7777

78-
const static struct {
78+
#ifdef _MSC_VER
79+
extern __declspec(selectany) /*enable comdat folding for this symbol in msvc*/
80+
#endif
81+
PORTALIGN(1) const char bad_esp_msg [] = "Win32::API a function was called with the wrong prototype "
82+
"and caused a C stack inconsistency EBP=%p ESP=%p" ;
83+
84+
#ifdef _MSC_VER
85+
extern __declspec(selectany) /*enable comdat folding for this symbol in msvc*/
86+
#endif
87+
PORTALIGN(1) const struct {
7988
char Unpack [sizeof("Win32::API::Type::Unpack")];
8089
char Pack [sizeof("Win32::API::Type::Pack")];
8190
char ck_type [sizeof("Win32::API::Struct::ck_type")];
@@ -106,9 +115,6 @@ STATIC SV * getTarg(pTHX) {
106115
return TARG;
107116
}
108117

109-
const char bad_esp_msg [] = "Win32::API a function was called with the wrong prototype "
110-
"and cause a C stack inconsistency EBP=%"UVxf" EBP=%"UVxf ;
111-
112118
/* Convert wide character string to mortal SV. Use UTF8 encoding
113119
* if the string cannot be represented in the system codepage.
114120
* If wlen isn't -1 (calculate length), wlen must include the null wchar
@@ -171,59 +177,6 @@ STATIC void w32sv_setwstr(pTHX_ SV * sv, WCHAR *wstr, INT_PTR wlenparam) {
171177
}
172178
if(tempwstr) Safefree(tempwstr);
173179
}
174-
/* bitfield is 4 bytes, low to high diagram\|/
175-
char flags, short stackunwind, char outType
176-
note the stackunwind is unaligned
177-
*/
178-
179-
#define CTRL_IS_MORE 0x10
180-
#define CTRL_HAS_PROTO 0x20
181-
typedef struct {
182-
union {
183-
struct {
184-
unsigned int convention: 3;
185-
unsigned int UseMI64: 1;
186-
unsigned int is_more: 1;
187-
unsigned int has_proto: 1;
188-
#ifndef _WIN64
189-
unsigned int reserved: 2;
190-
/* remember to change Call_asm in API::Call() if this is changed */
191-
unsigned int stackunwind: 16;
192-
#else
193-
unsigned int reserved: 18;
194-
#endif
195-
unsigned int out: 8;
196-
};
197-
U32 whole_bf;
198-
};
199-
U16 inparamlen;
200-
/* padding hole here, 2 bytes, 32 and 64*/
201-
FARPROC ApiFunction;
202-
SV * api; /* a non-ref counted weak RV to the blessed SVPV that holds
203-
APICONTROL, used to optimize method calls on the API obj, the
204-
refcount for the RV is stored in the obj's hidden hash*/
205-
/* this AV is here for no func call look up of it, intypes may be NULL*/
206-
AV * intypes;
207-
/* a padding hole here of unknown size */
208-
__declspec(align(16)) APIPARAM param;
209-
} APICONTROL;
210-
211-
#define APICONTROL_CC_STD 0
212-
#define APICONTROL_CC_C 1
213-
//fastcall, thiscall, regcall, will go here
214-
215-
typedef struct {
216-
/* on 32bit win, HeapAlloc granularity is 8 bytes, if you request less than
217-
size%8 == 0 request is rounded upto next 8, lets assume that
218-
struct perl_memory_debug_header, the HE, and HEK (all if applicable), will
219-
be some multiple of 4 on 32bit windows, since the string is null terminated
220-
even on pre-HEK stash name Perls (< 5.9.3), there are atleast 4 bytes
221-
readable at all times for HvNAME. */
222-
DWORD32 MagicLow;
223-
DWORD32 MagicHigh;
224-
DWORD_PTR EncodedPtr; /* nullless XOR encrypted APICONTROL */
225-
DWORD_PTR PtrKey; /* key to decrypt above ptr */
226-
} APICLASSNAME;
227180

228181
#if defined(_M_AMD64) || defined(__x86_64)
229182
#include "call_x86_64.h"
@@ -273,7 +226,7 @@ const static struct {
273226
STATIC SV * getMgSV(pTHX_ SV * sv) {
274227
MAGIC * mg;
275228
if(SvRMAGICAL(sv)) { /* implies SvTYPE >= SVt_PVMG */
276-
mg = mg_findext(sv, PERL_MAGIC_ext, &vtbl_API);
229+
mg = mg_findext(sv, PERL_MAGIC_ext, (const MGVTBL * const)&vtbl_API);
277230
if(mg) {
278231
return mg->mg_obj;
279232
}
@@ -286,7 +239,7 @@ STATIC SV * getMgSV(pTHX_ SV * sv) {
286239
STATIC void setMgSV(pTHX_ SV * sv, SV * newsv) {
287240
MAGIC * mg;
288241
if(SvRMAGICAL(sv)) { /* implies SvTYPE >= SVt_PVMG */
289-
mg = mg_findext(sv, PERL_MAGIC_ext, &vtbl_API);
242+
mg = mg_findext(sv, PERL_MAGIC_ext, (const MGVTBL * const)&vtbl_API);
290243
if(mg) {
291244
SV * oldsv;
292245
SvREFCNT_inc_simple_void_NN(newsv);
@@ -299,7 +252,7 @@ STATIC void setMgSV(pTHX_ SV * sv, SV * newsv) {
299252
}
300253
else {
301254
addmg:
302-
sv_magicext(sv,newsv,PERL_MAGIC_ext,&vtbl_API,NULL,0);
255+
sv_magicext(sv,newsv,PERL_MAGIC_ext,(const MGVTBL * const)&vtbl_API,NULL,0);
303256
}
304257
}
305258

@@ -368,6 +321,9 @@ BOOT:
368321
unsigned char len;
369322
unsigned char constval;
370323
} CONSTREG;
324+
#pragma pack(push)
325+
#pragma pack(push, 1)
326+
PORTALIGN(1)
371327
static const struct {
372328
#define XMM(y) CONSTREG cr_##y; char arr_##y [sizeof(#y)];
373329
XMM(T_VOID)
@@ -415,6 +371,8 @@ BOOT:
415371
XMM(T_FLAG_NUMERIC)
416372
#undef XMM
417373
};
374+
#pragma pack(pop)
375+
#pragma pack(pop)
418376
CONSTREG * entry = (CONSTREG *)&const_init;
419377
while((DWORD_PTR)entry < (DWORD_PTR)&const_init+sizeof(const_init)){
420378
newCONSTSUB(stash, (char *)((DWORD_PTR)entry+sizeof(CONSTREG)), newSVuv(entry->constval));
@@ -809,11 +767,6 @@ void
809767
_ImportXS(...)
810768
PREINIT:
811769
char * subname;
812-
#ifdef W32A_SPLITHEAD
813-
XS_EUPXS(XS_Win32__API_ImportCall);
814-
#else
815-
XS_EUPXS(XS_Win32__API_Call);
816-
#endif
817770
#if (PERL_REVISION == 5 && PERL_VERSION < 9)
818771
char* file = __FILE__;
819772
#else
@@ -827,11 +780,7 @@ CODE:
827780
subname = SvPVX(sv); }
828781
{ SV * api = POPs;
829782
PUTBACK;
830-
#ifdef W32A_SPLITHEAD
831783
{ CV * cv = newXS(subname, XS_Win32__API_ImportCall, file);
832-
#else
833-
{ CV * cv = newXS(subname, XS_Win32__API_Call, file);
834-
#endif
835784
XSANY.any_ptr = (APICONTROL *) SvPVX(SvRV(api));
836785
setMgSV(aTHX_ (SV*)cv, api); }}
837786
return;
@@ -863,3 +812,24 @@ PPCODE:
863812
croak("bad alignment");
864813
#endif
865814
return;
815+
816+
#ifdef WIN32_API_PROF
817+
void
818+
_DumpTimes()
819+
CODE:
820+
printf("dumptimes start %I64u loopprep %I64u loopstart %I64u Call_asm_b4 %I64u Call_asm_after %I64u rtn_time\n",
821+
start_loopprep.QuadPart, loopprep_loopstart.QuadPart, loopstart_Call_asm_b4.QuadPart, Call_asm_b4_Call_asm_after.QuadPart, Call_asm_after_return_time.QuadPart);
822+
823+
#endif
824+
825+
#ifdef WIN32_API_PROF
826+
void
827+
_ResetTimes()
828+
CODE:
829+
start_loopprep.QuadPart = 0,
830+
loopprep_loopstart.QuadPart = 0,
831+
loopstart_Call_asm_b4.QuadPart = 0,
832+
Call_asm_b4_Call_asm_after.QuadPart = 0,
833+
Call_asm_after_return_time.QuadPart = 0;
834+
835+
#endif

0 commit comments

Comments
 (0)