Updated libsharp to commit 0787838ab3ec8afc0c28b98479a321ffba388980

This commit is contained in:
Guilhem Lavaux 2016-11-04 18:14:49 +01:00
parent a933430c60
commit 23aa450a77
62 changed files with 5075 additions and 11205 deletions

View File

@ -19,6 +19,8 @@ include libfftpack/planck.make
include libsharp/planck.make include libsharp/planck.make
include docsrc/planck.make include docsrc/planck.make
CYTHON_MODULES=python/libsharp/libsharp.so $(if $(MPI_CFLAGS), python/libsharp/libsharp_mpi.so)
$(all_lib): %: | $(LIBDIR)_mkdir $(all_lib): %: | $(LIBDIR)_mkdir
@echo "# creating library $*" @echo "# creating library $*"
$(ARCREATE) $@ $^ $(ARCREATE) $@ $^
@ -38,18 +40,39 @@ hdrcopy: | $(INCDIR)_mkdir
$(notdir $(all_cbin)) : % : $(BINDIR)/% $(notdir $(all_cbin)) : % : $(BINDIR)/%
test: compile_all test: compile_all
$(BINDIR)/sharp_acctest && \ $(BINDIR)/sharp_testsuite acctest && \
$(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \ $(BINDIR)/sharp_testsuite test healpix 2048 -1 1024 -1 0 1 && \
$(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \ $(BINDIR)/sharp_testsuite test fejer1 2047 -1 -1 4096 2 1 && \
$(BINDIR)/sharp_test gauss 2047 4096 0 0 2 $(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 2
perftest: compile_all perftest: compile_all
$(BINDIR)/sharp_test healpix 2048 1024 0 0 1 && \ $(BINDIR)/sharp_testsuite test healpix 2048 -1 1024 -1 0 1 && \
$(BINDIR)/sharp_test gauss 63 128 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 63 -1 -1 128 0 1 && \
$(BINDIR)/sharp_test gauss 127 256 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 127 -1 -1 256 0 1 && \
$(BINDIR)/sharp_test gauss 255 512 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 255 -1 -1 512 0 1 && \
$(BINDIR)/sharp_test gauss 511 1024 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 511 -1 -1 1024 0 1 && \
$(BINDIR)/sharp_test gauss 1023 2048 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 1023 -1 -1 2048 0 1 && \
$(BINDIR)/sharp_test gauss 2047 4096 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \
$(BINDIR)/sharp_test gauss 4095 8192 0 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \
$(BINDIR)/sharp_test gauss 8191 16384 0 0 1 $(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1
# Render a C source from its Jinja template (%.c.in).
#
# Only do this if the md5sum changed, in order to avoid Python and Jinja
# dependency when not modifying the c.in file: the recipe looks for the
# template's md5sum inside the existing target and re-renders only when it
# is not found (presumably runjinja.py embeds that checksum -- confirm).
#
# grep runs with -q/-s so a hit does not dump the matching line and a
# not-yet-existing target does not produce an error message.  The output is
# rendered into $@.tmp and moved into place only on success, so a failing
# runjinja.py can no longer leave a truncated target behind that is newer
# than its template.
%.c: %.c.in
	grep -qs `md5sum $< | cut -d ' ' -f 1` $@ || \
	  { ./runjinja.py < $< > $@.tmp && mv -f $@.tmp $@; } || \
	  { rm -f $@.tmp; exit 1; }
# Remove the generated libsharp/sharp_legendre.c (presumably the output of
# the %.c: %.c.in rule -- confirm).  Declared phony because it names an
# action, not a file.  `rm -f` stays quiet when the file is already gone but
# still reports real failures, unlike the former `rm ... || exit 0`.
.PHONY: genclean
genclean:
	rm -f libsharp/sharp_legendre.c
# Static pattern rule: build every Cython extension listed in CYTHON_MODULES
# from the .pyx file with the same stem.
#   1. cython is run on the .pyx prerequisite ($<);
#   2. $(CC) compiles $(<:.pyx=.c) into $(<:.pyx=.o) with the debug, OpenMP
#      and PIC flags, python-config's cflags and the headers in $(INCDIR);
#   3. $(CL) links that object into the shared module $@ against -lsharp,
#      -lfftpack and -lc_utils from $(LIBDIR) plus python-config's ldflags.
# The $(error ...) guard is only present when PIC_CFLAGS has no (non-empty)
# value, i.e. the extension cannot be built without PIC flags.
# NOTE(review): the ifndef itself is evaluated while the makefile is parsed;
# whether $(error ...) fires at that point or only when this recipe runs
# depends on the line's leading tab, which is not visible here -- confirm.
$(CYTHON_MODULES): %.so: %.pyx
ifndef PIC_CFLAGS
$(error Python extension must be built using the --enable-pic configure option.)
endif
cython $<
$(CC) $(DEBUG_CFLAGS) $(OPENMP_CFLAGS) $(PIC_CFLAGS) `python-config --cflags` -I$(INCDIR) -o $(<:.pyx=.o) -c $(<:.pyx=.c)
$(CL) -shared $(<:.pyx=.o) $(OPENMP_CFLAGS) $(CYTHON_OBJ) -L$(LIBDIR) -lsharp -lfftpack -lc_utils -L`python-config --prefix`/lib `python-config --ldflags` -o $@
# Both targets are commands, not files.  `python` in particular is also the
# name of a directory in this tree (python/libsharp/...), so they are
# declared phony to keep make from ever treating them as up-to-date files.
.PHONY: python pytest

# Build the libraries, run hdrcopy and build the Cython extension modules.
python: $(all_lib) hdrcopy $(CYTHON_MODULES)

# Run the Python SHT tests from inside the python/ directory.
pytest: python
	cd python && nosetests --nocapture libsharp/tests/test_sht.py

View File

@ -1,13 +0,0 @@
GNU make and GNU gcc (version 4.x) are required for compilation.
Simply run "./configure"; if this fails, please refer to the output of
"./configure --help" for additional hints and, if necessary, provide
additional flags to the configure script.
Once the script finishes successfully, run "make"
(or "gmake"). This should install the compilation products in the
subdirectory "auto/".
Documentation can be created by the command "(g)make doc".
However this requires the doxygen application to be installed
on your system.
The documentation will be created in the subdirectory doc/.

43
external/sharp/README.md vendored Normal file
View File

@ -0,0 +1,43 @@
# Libsharp
*IMPORTANT NOTE*: It appears that the default branch upon cloning from
github.com/dagss/libsharp was an outdated 'dagss' branch instead of
the 'master' branch. To get the latest copy,
please do `git checkout master; git pull`. New clones are no longer affected.
## Paper
https://arxiv.org/abs/1303.4945
## Compilation
GNU make is required for compilation.
Libsharp compilation has been successfully tested with GNU and Intel compilers.
When using gcc, version 4.x is required [1].
Since libsharp was written in standard C99, other compilers should work fine,
but SSE2/AVX support will most likely be deactivated.
If you obtained libsharp directly from the git repository, you will also
need a copy of the GNU autotools. In this case, run "autoconf" in libsharp's
main directory before any other steps.
For libsharp releases distributed as a .tar.gz file, this step is not necessary.
Afterwards, simply run "./configure"; if this fails, please refer to the output
of "./configure --help" for additional hints and, if necessary, provide
additional flags to the configure script.
Once the script finishes successfully, run "make"
(or "gmake"). This should install the compilation products in the
subdirectory "auto/".
Documentation can be created by the command "(g)make doc".
However this requires the doxygen application to be installed
on your system.
The documentation will be created in the subdirectory doc/.
[1] Some versions of the gcc 4.4.x release series contain a bug which causes
the compiler to crash during libsharp compilation. This appears to be fixed
in the gcc 4.4.7 release. It is possible to work around this problem by adding
the compiler flag "-fno-tree-fre" after the other optimization flags - the
configure script should do this automatically.

File diff suppressed because it is too large Load Diff

View File

@ -1,79 +0,0 @@
# This file was generated by Autom4te Sun Nov 6 20:57:04 UTC 2011.
# It contains the lists of macros which have been traced.
# It can be safely removed.
@request = (
bless( [
'0',
1,
[
'/usr/share/autoconf'
],
[
'/usr/share/autoconf/autoconf/autoconf.m4f',
'configure.ac'
],
{
'AM_PROG_F77_C_O' => 1,
'_LT_AC_TAGCONFIG' => 1,
'm4_pattern_forbid' => 1,
'AC_INIT' => 1,
'AC_CANONICAL_TARGET' => 1,
'_AM_COND_IF' => 1,
'AC_CONFIG_LIBOBJ_DIR' => 1,
'AC_SUBST' => 1,
'AC_CANONICAL_HOST' => 1,
'AC_FC_SRCEXT' => 1,
'AC_DEFUN' => 1,
'AC_PROG_LIBTOOL' => 1,
'AM_INIT_AUTOMAKE' => 1,
'AC_CONFIG_SUBDIRS' => 1,
'AM_PATH_GUILE' => 1,
'AM_AUTOMAKE_VERSION' => 1,
'LT_CONFIG_LTDL_DIR' => 1,
'AC_CONFIG_LINKS' => 1,
'AC_REQUIRE_AUX_FILE' => 1,
'LT_SUPPORTED_TAG' => 1,
'm4_sinclude' => 1,
'AM_MAINTAINER_MODE' => 1,
'AC_DEFUN_ONCE' => 1,
'AM_NLS' => 1,
'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
'_m4_warn' => 1,
'AM_MAKEFILE_INCLUDE' => 1,
'AM_PROG_CXX_C_O' => 1,
'_AM_MAKEFILE_INCLUDE' => 1,
'_AM_COND_ENDIF' => 1,
'AM_ENABLE_MULTILIB' => 1,
'AM_SILENT_RULES' => 1,
'AM_PROG_MOC' => 1,
'AC_CONFIG_FILES' => 1,
'LT_INIT' => 1,
'include' => 1,
'AM_GNU_GETTEXT' => 1,
'AM_PROG_AR' => 1,
'AC_LIBSOURCE' => 1,
'AC_CANONICAL_BUILD' => 1,
'AM_PROG_FC_C_O' => 1,
'AC_FC_FREEFORM' => 1,
'AH_OUTPUT' => 1,
'AC_CONFIG_AUX_DIR' => 1,
'_AM_SUBST_NOTMAKE' => 1,
'm4_pattern_allow' => 1,
'_AM_AUTOCONF_VERSION' => 1,
'AM_PROG_CC_C_O' => 1,
'sinclude' => 1,
'AM_CONDITIONAL' => 1,
'AC_CANONICAL_SYSTEM' => 1,
'AM_XGETTEXT_OPTION' => 1,
'AC_CONFIG_HEADERS' => 1,
'AC_DEFINE_TRACE_LITERAL' => 1,
'AM_POT_TOOLS' => 1,
'm4_include' => 1,
'_AM_COND_ELSE' => 1,
'AU_DEFUN' => 1,
'AC_SUBST_TRACE' => 1
}
], 'Autom4te::Request' )
);

View File

@ -1,229 +0,0 @@
m4trace:configure.ac:1: -1- AC_INIT([config/config.auto.in])
m4trace:configure.ac:1: -1- m4_pattern_forbid([^_?A[CHUM]_])
m4trace:configure.ac:1: -1- m4_pattern_forbid([_AC_])
m4trace:configure.ac:1: -1- m4_pattern_forbid([^LIBOBJS$], [do not use LIBOBJS directly, use AC_LIBOBJ (see section `AC_LIBOBJ vs LIBOBJS'])
m4trace:configure.ac:1: -1- m4_pattern_allow([^AS_FLAGS$])
m4trace:configure.ac:1: -1- m4_pattern_forbid([^_?m4_])
m4trace:configure.ac:1: -1- m4_pattern_forbid([^dnl$])
m4trace:configure.ac:1: -1- m4_pattern_forbid([^_?AS_])
m4trace:configure.ac:1: -1- AC_SUBST([SHELL])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([SHELL])
m4trace:configure.ac:1: -1- m4_pattern_allow([^SHELL$])
m4trace:configure.ac:1: -1- AC_SUBST([PATH_SEPARATOR])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PATH_SEPARATOR])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PATH_SEPARATOR$])
m4trace:configure.ac:1: -1- AC_SUBST([PACKAGE_NAME], [m4_ifdef([AC_PACKAGE_NAME], ['AC_PACKAGE_NAME'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PACKAGE_NAME])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_NAME$])
m4trace:configure.ac:1: -1- AC_SUBST([PACKAGE_TARNAME], [m4_ifdef([AC_PACKAGE_TARNAME], ['AC_PACKAGE_TARNAME'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PACKAGE_TARNAME])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_TARNAME$])
m4trace:configure.ac:1: -1- AC_SUBST([PACKAGE_VERSION], [m4_ifdef([AC_PACKAGE_VERSION], ['AC_PACKAGE_VERSION'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PACKAGE_VERSION])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_VERSION$])
m4trace:configure.ac:1: -1- AC_SUBST([PACKAGE_STRING], [m4_ifdef([AC_PACKAGE_STRING], ['AC_PACKAGE_STRING'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PACKAGE_STRING])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_STRING$])
m4trace:configure.ac:1: -1- AC_SUBST([PACKAGE_BUGREPORT], [m4_ifdef([AC_PACKAGE_BUGREPORT], ['AC_PACKAGE_BUGREPORT'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PACKAGE_BUGREPORT])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_BUGREPORT$])
m4trace:configure.ac:1: -1- AC_SUBST([PACKAGE_URL], [m4_ifdef([AC_PACKAGE_URL], ['AC_PACKAGE_URL'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([PACKAGE_URL])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_URL$])
m4trace:configure.ac:1: -1- AC_SUBST([exec_prefix], [NONE])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([exec_prefix])
m4trace:configure.ac:1: -1- m4_pattern_allow([^exec_prefix$])
m4trace:configure.ac:1: -1- AC_SUBST([prefix], [NONE])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([prefix])
m4trace:configure.ac:1: -1- m4_pattern_allow([^prefix$])
m4trace:configure.ac:1: -1- AC_SUBST([program_transform_name], [s,x,x,])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([program_transform_name])
m4trace:configure.ac:1: -1- m4_pattern_allow([^program_transform_name$])
m4trace:configure.ac:1: -1- AC_SUBST([bindir], ['${exec_prefix}/bin'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([bindir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^bindir$])
m4trace:configure.ac:1: -1- AC_SUBST([sbindir], ['${exec_prefix}/sbin'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([sbindir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^sbindir$])
m4trace:configure.ac:1: -1- AC_SUBST([libexecdir], ['${exec_prefix}/libexec'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([libexecdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^libexecdir$])
m4trace:configure.ac:1: -1- AC_SUBST([datarootdir], ['${prefix}/share'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([datarootdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^datarootdir$])
m4trace:configure.ac:1: -1- AC_SUBST([datadir], ['${datarootdir}'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([datadir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^datadir$])
m4trace:configure.ac:1: -1- AC_SUBST([sysconfdir], ['${prefix}/etc'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([sysconfdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^sysconfdir$])
m4trace:configure.ac:1: -1- AC_SUBST([sharedstatedir], ['${prefix}/com'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([sharedstatedir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^sharedstatedir$])
m4trace:configure.ac:1: -1- AC_SUBST([localstatedir], ['${prefix}/var'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([localstatedir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^localstatedir$])
m4trace:configure.ac:1: -1- AC_SUBST([includedir], ['${prefix}/include'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([includedir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^includedir$])
m4trace:configure.ac:1: -1- AC_SUBST([oldincludedir], ['/usr/include'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([oldincludedir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^oldincludedir$])
m4trace:configure.ac:1: -1- AC_SUBST([docdir], [m4_ifset([AC_PACKAGE_TARNAME],
['${datarootdir}/doc/${PACKAGE_TARNAME}'],
['${datarootdir}/doc/${PACKAGE}'])])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([docdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^docdir$])
m4trace:configure.ac:1: -1- AC_SUBST([infodir], ['${datarootdir}/info'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([infodir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^infodir$])
m4trace:configure.ac:1: -1- AC_SUBST([htmldir], ['${docdir}'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([htmldir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^htmldir$])
m4trace:configure.ac:1: -1- AC_SUBST([dvidir], ['${docdir}'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([dvidir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^dvidir$])
m4trace:configure.ac:1: -1- AC_SUBST([pdfdir], ['${docdir}'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([pdfdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^pdfdir$])
m4trace:configure.ac:1: -1- AC_SUBST([psdir], ['${docdir}'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([psdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^psdir$])
m4trace:configure.ac:1: -1- AC_SUBST([libdir], ['${exec_prefix}/lib'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([libdir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^libdir$])
m4trace:configure.ac:1: -1- AC_SUBST([localedir], ['${datarootdir}/locale'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([localedir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^localedir$])
m4trace:configure.ac:1: -1- AC_SUBST([mandir], ['${datarootdir}/man'])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([mandir])
m4trace:configure.ac:1: -1- m4_pattern_allow([^mandir$])
m4trace:configure.ac:1: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_NAME])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_NAME$])
m4trace:configure.ac:1: -1- AH_OUTPUT([PACKAGE_NAME], [/* Define to the full name of this package. */
@%:@undef PACKAGE_NAME])
m4trace:configure.ac:1: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_TARNAME])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_TARNAME$])
m4trace:configure.ac:1: -1- AH_OUTPUT([PACKAGE_TARNAME], [/* Define to the one symbol short name of this package. */
@%:@undef PACKAGE_TARNAME])
m4trace:configure.ac:1: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_VERSION])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_VERSION$])
m4trace:configure.ac:1: -1- AH_OUTPUT([PACKAGE_VERSION], [/* Define to the version of this package. */
@%:@undef PACKAGE_VERSION])
m4trace:configure.ac:1: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_STRING])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_STRING$])
m4trace:configure.ac:1: -1- AH_OUTPUT([PACKAGE_STRING], [/* Define to the full name and version of this package. */
@%:@undef PACKAGE_STRING])
m4trace:configure.ac:1: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_BUGREPORT])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_BUGREPORT$])
m4trace:configure.ac:1: -1- AH_OUTPUT([PACKAGE_BUGREPORT], [/* Define to the address where bug reports for this package should be sent. */
@%:@undef PACKAGE_BUGREPORT])
m4trace:configure.ac:1: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_URL])
m4trace:configure.ac:1: -1- m4_pattern_allow([^PACKAGE_URL$])
m4trace:configure.ac:1: -1- AH_OUTPUT([PACKAGE_URL], [/* Define to the home page for this package. */
@%:@undef PACKAGE_URL])
m4trace:configure.ac:1: -1- AC_SUBST([DEFS])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([DEFS])
m4trace:configure.ac:1: -1- m4_pattern_allow([^DEFS$])
m4trace:configure.ac:1: -1- AC_SUBST([ECHO_C])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([ECHO_C])
m4trace:configure.ac:1: -1- m4_pattern_allow([^ECHO_C$])
m4trace:configure.ac:1: -1- AC_SUBST([ECHO_N])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([ECHO_N])
m4trace:configure.ac:1: -1- m4_pattern_allow([^ECHO_N$])
m4trace:configure.ac:1: -1- AC_SUBST([ECHO_T])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([ECHO_T])
m4trace:configure.ac:1: -1- m4_pattern_allow([^ECHO_T$])
m4trace:configure.ac:1: -1- AC_SUBST([LIBS])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([LIBS])
m4trace:configure.ac:1: -1- m4_pattern_allow([^LIBS$])
m4trace:configure.ac:1: -1- AC_SUBST([build_alias])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([build_alias])
m4trace:configure.ac:1: -1- m4_pattern_allow([^build_alias$])
m4trace:configure.ac:1: -1- AC_SUBST([host_alias])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([host_alias])
m4trace:configure.ac:1: -1- m4_pattern_allow([^host_alias$])
m4trace:configure.ac:1: -1- AC_SUBST([target_alias])
m4trace:configure.ac:1: -1- AC_SUBST_TRACE([target_alias])
m4trace:configure.ac:1: -1- m4_pattern_allow([^target_alias$])
m4trace:configure.ac:3: -1- AC_SUBST([uname_found])
m4trace:configure.ac:3: -1- AC_SUBST_TRACE([uname_found])
m4trace:configure.ac:3: -1- m4_pattern_allow([^uname_found$])
m4trace:configure.ac:12: -1- _m4_warn([obsolete], [The macro `AC_TRY_COMPILE' is obsolete.
You should run autoupdate.], [../../lib/autoconf/general.m4:2615: AC_TRY_COMPILE is expanded from...
configure.ac:12: the top level])
m4trace:configure.ac:12: -1- AC_SUBST([CC])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CC])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CC$])
m4trace:configure.ac:12: -1- AC_SUBST([CFLAGS])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CFLAGS])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CFLAGS$])
m4trace:configure.ac:12: -1- AC_SUBST([LDFLAGS])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([LDFLAGS])
m4trace:configure.ac:12: -1- m4_pattern_allow([^LDFLAGS$])
m4trace:configure.ac:12: -1- AC_SUBST([LIBS])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([LIBS])
m4trace:configure.ac:12: -1- m4_pattern_allow([^LIBS$])
m4trace:configure.ac:12: -1- AC_SUBST([CPPFLAGS])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CPPFLAGS])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CPPFLAGS$])
m4trace:configure.ac:12: -1- AC_SUBST([CC])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CC])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CC$])
m4trace:configure.ac:12: -1- AC_SUBST([CC])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CC])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CC$])
m4trace:configure.ac:12: -1- AC_SUBST([CC])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CC])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CC$])
m4trace:configure.ac:12: -1- AC_SUBST([CC])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([CC])
m4trace:configure.ac:12: -1- m4_pattern_allow([^CC$])
m4trace:configure.ac:12: -1- AC_SUBST([ac_ct_CC])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([ac_ct_CC])
m4trace:configure.ac:12: -1- m4_pattern_allow([^ac_ct_CC$])
m4trace:configure.ac:12: -1- AC_SUBST([EXEEXT], [$ac_cv_exeext])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([EXEEXT])
m4trace:configure.ac:12: -1- m4_pattern_allow([^EXEEXT$])
m4trace:configure.ac:12: -1- AC_SUBST([OBJEXT], [$ac_cv_objext])
m4trace:configure.ac:12: -1- AC_SUBST_TRACE([OBJEXT])
m4trace:configure.ac:12: -1- m4_pattern_allow([^OBJEXT$])
m4trace:configure.ac:20: -1- AC_SUBST([OPENMP_CFLAGS])
m4trace:configure.ac:20: -1- AC_SUBST_TRACE([OPENMP_CFLAGS])
m4trace:configure.ac:20: -1- m4_pattern_allow([^OPENMP_CFLAGS$])
m4trace:configure.ac:55: -1- AC_SUBST([GCCVERSION])
m4trace:configure.ac:55: -1- AC_SUBST_TRACE([GCCVERSION])
m4trace:configure.ac:55: -1- m4_pattern_allow([^GCCVERSION$])
m4trace:configure.ac:106: -1- AC_SUBST([SILENT_RULE])
m4trace:configure.ac:106: -1- AC_SUBST_TRACE([SILENT_RULE])
m4trace:configure.ac:106: -1- m4_pattern_allow([^SILENT_RULE$])
m4trace:configure.ac:107: -1- AC_SUBST([CC])
m4trace:configure.ac:107: -1- AC_SUBST_TRACE([CC])
m4trace:configure.ac:107: -1- m4_pattern_allow([^CC$])
m4trace:configure.ac:108: -1- AC_SUBST([CCFLAGS_NO_C])
m4trace:configure.ac:108: -1- AC_SUBST_TRACE([CCFLAGS_NO_C])
m4trace:configure.ac:108: -1- m4_pattern_allow([^CCFLAGS_NO_C$])
m4trace:configure.ac:109: -1- AC_SUBST([LDCCFLAGS])
m4trace:configure.ac:109: -1- AC_SUBST_TRACE([LDCCFLAGS])
m4trace:configure.ac:109: -1- m4_pattern_allow([^LDCCFLAGS$])
m4trace:configure.ac:110: -1- AC_SUBST([ARCREATE])
m4trace:configure.ac:110: -1- AC_SUBST_TRACE([ARCREATE])
m4trace:configure.ac:110: -1- m4_pattern_allow([^ARCREATE$])
m4trace:configure.ac:112: -1- AC_CONFIG_FILES([config/config.auto])
m4trace:configure.ac:112: -1- _m4_warn([obsolete], [AC_OUTPUT should be used without arguments.
You should run autoupdate.], [])
m4trace:configure.ac:112: -1- AC_SUBST([LIB@&t@OBJS], [$ac_libobjs])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([LIB@&t@OBJS])
m4trace:configure.ac:112: -1- m4_pattern_allow([^LIB@&t@OBJS$])
m4trace:configure.ac:112: -1- AC_SUBST([LTLIBOBJS], [$ac_ltlibobjs])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([LTLIBOBJS])
m4trace:configure.ac:112: -1- m4_pattern_allow([^LTLIBOBJS$])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([top_builddir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([top_build_prefix])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([srcdir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([abs_srcdir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([top_srcdir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([abs_top_srcdir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([builddir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([abs_builddir])
m4trace:configure.ac:112: -1- AC_SUBST_TRACE([abs_top_builddir])

View File

@ -113,10 +113,10 @@ void util_free_ (void *ptr);
#define ALLOC2D(ptr,type,num1,num2) \ #define ALLOC2D(ptr,type,num1,num2) \
do { \ do { \
size_t cnt_, num1_=(num1), num2_=(num2); \ size_t cnt_, num1_=(num1), num2_=(num2); \
ALLOC(ptr,type *,num1_); \ ALLOC((ptr),type *,num1_); \
ALLOC(ptr[0],type,num1_*num2_); \ ALLOC((ptr)[0],type,num1_*num2_); \
for (cnt_=1; cnt_<num1_; ++cnt_) \ for (cnt_=1; cnt_<num1_; ++cnt_) \
ptr[cnt_]=ptr[cnt_-1]+num2_; \ (ptr)[cnt_]=(ptr)[cnt_-1]+num2_; \
} while(0) } while(0)
#define DEALLOC2D(ptr) \ #define DEALLOC2D(ptr) \
do { if(ptr) DEALLOC((ptr)[0]); DEALLOC(ptr); } while(0) do { if(ptr) DEALLOC((ptr)[0]); DEALLOC(ptr); } while(0)

View File

@ -38,7 +38,8 @@ double residentSetSize(void)
FILE *statm = fopen("/proc/self/statm","r"); FILE *statm = fopen("/proc/self/statm","r");
double res; double res;
if (!statm) return -1.0; if (!statm) return -1.0;
fscanf(statm,"%*f %lf",&res); if (fscanf(statm,"%*f %lf",&res))
{ fclose(statm); return -1.0; }
fclose(statm); fclose(statm);
return (res*4096); return (res*4096);
} }
@ -55,8 +56,8 @@ double VmHWM(void)
{ fclose(f); return -1.0; } { fclose(f); return -1.0; }
if (!strncmp(word, "VmHWM:", 6)) if (!strncmp(word, "VmHWM:", 6))
{ {
fscanf(f,"%lf",&res); if (fscanf(f,"%lf%2s",&res,word)<0)
fscanf(f,"%2s",word); { fclose(f); return -1.0; }
if (strncmp(word, "kB", 2)) if (strncmp(word, "kB", 2))
{ fclose(f); return -1.0; } { fclose(f); return -1.0; }
res *=1024; res *=1024;

View File

@ -5,5 +5,8 @@ CL=@CC@
CCFLAGS_NO_C=@CCFLAGS_NO_C@ CCFLAGS_NO_C=@CCFLAGS_NO_C@
CCFLAGS=$(CCFLAGS_NO_C) -c CCFLAGS=$(CCFLAGS_NO_C) -c
CLFLAGS=-L. -L$(LIBDIR) @LDCCFLAGS@ -lm CLFLAGS=-L. -L$(LIBDIR) @LDCCFLAGS@ -lm
DEBUG_CFLAGS=@DEBUG_CFLAGS@
MPI_CFLAGS=@MPI_CFLAGS@
OPENMP_CFLAGS=@OPENMP_CFLAGS@
PIC_CFLAGS=@PIC_CFLAGS@
ARCREATE=@ARCREATE@ ARCREATE=@ARCREATE@

View File

@ -1,9 +1,10 @@
BLDROOT = $(SRCROOT)/build.$(SHARP_TARGET) BLDROOT = $(SRCROOT)/build.$(SHARP_TARGET)
PREFIX = $(SRCROOT)/$(SHARP_TARGET) PREFIX = $(SRCROOT)/$(SHARP_TARGET)
BINDIR = $(PREFIX)/bin BINDIR = $(PREFIX)/bin
INCDIR = $(PREFIX)/include INCDIR = $(PREFIX)/include
LIBDIR = $(PREFIX)/lib LIBDIR = $(PREFIX)/lib
DOCDIR = $(SRCROOT)/doc DOCDIR = $(SRCROOT)/doc
PYTHONDIR = $(SRCROOT)/python/libsharp
# do not use any suffix rules # do not use any suffix rules
.SUFFIXES: .SUFFIXES:
@ -26,6 +27,7 @@ $(BLDROOT)/%.o : $(SRCROOT)/%.cc | echo_config
clean: clean:
rm -rf $(BLDROOT) $(PREFIX) $(DOCDIR) autom4te.cache/ config.log config.status rm -rf $(BLDROOT) $(PREFIX) $(DOCDIR) autom4te.cache/ config.log config.status
rm -rf $(PYTHONDIR)/*.c $(PYTHONDIR)/*.o $(PYTHONDIR)/*.so
distclean: clean distclean: clean
rm -f config/config.auto rm -f config/config.auto

3974
external/sharp/configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -53,19 +53,16 @@ case $CCTYPE in
GCCVERSION="`$CC -dumpversion 2>&1`" GCCVERSION="`$CC -dumpversion 2>&1`"
echo "Using gcc version $GCCVERSION" echo "Using gcc version $GCCVERSION"
AC_SUBST(GCCVERSION) AC_SUBST(GCCVERSION)
case $system in
Darwin-*)
;;
*)
CCFLAGS="$CCFLAGS -ffunction-sections -fdata-sections"
;;
esac
changequote(,) changequote(,)
gcc43=`echo $GCCVERSION | grep -c '4\.[3456789]'` gcc43=`echo $GCCVERSION | grep -c '^4\.[3456789]'`
gcc44=`echo $GCCVERSION | grep -c '^4\.4'`
changequote([,]) changequote([,])
if test $gcc43 -gt 0; then if test $gcc43 -gt 0; then
CCFLAGS="$CCFLAGS -march=native" CCFLAGS="$CCFLAGS -march=native"
fi fi
if test $gcc44 -gt 0; then
CCFLAGS="$CCFLAGS -fno-tree-fre"
fi
;; ;;
icc) icc)
CCFLAGS="-O3 -xHOST -std=c99 -ip -Wbrief -Wall -vec-report0 -openmp-report0 -wd383,981,1419,1572" CCFLAGS="-O3 -xHOST -std=c99 -ip -Wbrief -Wall -vec-report0 -openmp-report0 -wd383,981,1419,1572"
@ -85,20 +82,20 @@ case $system in
;; ;;
esac esac
CCFLAGS="$CCFLAGS $OPENMP_CFLAGS"
if test $ENABLE_DEBUG = yes; then if test $ENABLE_DEBUG = yes; then
CCFLAGS="$CCFLAGS -g" DEBUG_CFLAGS="-g"
fi fi
if test $ENABLE_PIC = yes; then if test $ENABLE_PIC = yes; then
CCFLAGS="$CCFLAGS -fPIC" PIC_CFLAGS="-fPIC"
fi fi
if test $ENABLE_MPI = yes; then if test $ENABLE_MPI = yes; then
CCFLAGS="$CCFLAGS -DUSE_MPI" MPI_CFLAGS="-DUSE_MPI"
fi fi
CCFLAGS="$CCFLAGS $DEBUG_CFLAGS $OPENMP_CFLAGS $PIC_CFLAGS $MPI_CFLAGS"
CCFLAGS_NO_C="$CCFLAGS $CPPFLAGS" CCFLAGS_NO_C="$CCFLAGS $CPPFLAGS"
LDCCFLAGS="$LDFLAGS $CCFLAGS" LDCCFLAGS="$LDFLAGS $CCFLAGS"
@ -107,6 +104,10 @@ AC_SUBST(SILENT_RULE)
AC_SUBST(CC) AC_SUBST(CC)
AC_SUBST(CCFLAGS_NO_C) AC_SUBST(CCFLAGS_NO_C)
AC_SUBST(LDCCFLAGS) AC_SUBST(LDCCFLAGS)
AC_SUBST(DEBUG_CFLAGS)
AC_SUBST(MPI_CFLAGS)
AC_SUBST(OPENMP_CFLAGS)
AC_SUBST(PIC_CFLAGS)
AC_SUBST(ARCREATE) AC_SUBST(ARCREATE)
AC_OUTPUT(config/config.auto) AC_OUTPUT(config/config.auto)

286
external/sharp/fortran/sharp.f90 vendored Normal file
View File

@ -0,0 +1,286 @@
module sharp
use iso_c_binding
implicit none
! alm_info flags
integer, parameter :: SHARP_PACKED = 1
! sharp job types
enum, bind(c)
enumerator :: SHARP_YtW = 0
enumerator :: SHARP_Y = 1
enumerator :: SHARP_Yt = 2
enumerator :: SHARP_WY = 3
enumerator :: SHARP_ALM2MAP_DERIV1 = 4
end enum
! sharp job flags
integer, parameter :: SHARP_DP = ISHFT(1, 4)
integer, parameter :: SHARP_ADD = ISHFT(1, 5)
integer, parameter :: SHARP_REAL_HARMONICS = ISHFT(1, 6)
integer, parameter :: SHARP_NO_FFT = ISHFT(1, 7)
type sharp_geom_info
type(c_ptr) :: handle
integer(c_intptr_t) :: n_local
end type sharp_geom_info
type sharp_alm_info
type(c_ptr) :: handle
integer(c_intptr_t) :: n_local
end type sharp_alm_info
! Explicit interfaces for the C entry points used by this module.  Every
! procedure below is declared bind(c).  Scalars carrying the VALUE attribute
! are passed by value; all other dummies are passed by reference, so a
! type(c_ptr) dummy with intent(out) receives a handle written by the C side.
! NOTE(review): several bind(c) dummies are OPTIONAL, which needs a compiler
! implementing TS 29113 / Fortran 2018 (an absent argument is then passed as
! a null pointer) -- confirm for the compilers in use.
interface
! alm_info
! Bound under its own (lower-case) name since no name= is given.
! mval and mvstart both have nm entries; the new handle comes back in alm_info.
subroutine sharp_make_general_alm_info( &
lmax, nm, stride, mval, mvstart, flags, alm_info) bind(c)
use iso_c_binding
integer(c_int), value, intent(in) :: lmax, nm, stride, flags
integer(c_int), intent(in) :: mval(nm)
integer(c_intptr_t), intent(in) :: mvstart(nm)
type(c_ptr), intent(out) :: alm_info
end subroutine sharp_make_general_alm_info
! Binds the C symbol 'sharp_make_mmajor_real_packed_alm_info' under a
! c_-prefixed Fortran name; the module procedure carrying the unprefixed
! name wraps it.  Argument order is (lmax, stride, nm, ms, alm_info);
! ms is optional.
subroutine c_sharp_make_mmajor_real_packed_alm_info( &
lmax, stride, nm, ms, alm_info) bind(c, name='sharp_make_mmajor_real_packed_alm_info')
use iso_c_binding
integer(c_int), value, intent(in) :: lmax, nm, stride
integer(c_int), intent(in), optional :: ms(nm)
type(c_ptr), intent(out) :: alm_info
end subroutine c_sharp_make_mmajor_real_packed_alm_info
! Binds C 'sharp_alm_count'; takes the handle by value and returns a
! c_intptr_t-sized integer.
function c_sharp_alm_count(alm_info) bind(c, name='sharp_alm_count')
use iso_c_binding
integer(c_intptr_t) :: c_sharp_alm_count
type(c_ptr), value, intent(in) :: alm_info
end function c_sharp_alm_count
! Binds C 'sharp_destroy_alm_info'; the handle is passed by value, so the
! caller's copy of the pointer is not modified by this call.
subroutine c_sharp_destroy_alm_info(alm_info) bind(c, name='sharp_destroy_alm_info')
use iso_c_binding
type(c_ptr), value :: alm_info
end subroutine c_sharp_destroy_alm_info
! geom_info
! Bound under its own (lower-case) name since no name= is given.
! rings (nrings entries) and weight (2*nside entries) are both optional;
! the new handle comes back in geom_info.
subroutine sharp_make_subset_healpix_geom_info ( &
nside, stride, nrings, rings, weight, geom_info) bind(c)
use iso_c_binding
integer(c_int), value, intent(in) :: nside, stride, nrings
integer(c_int), intent(in), optional :: rings(nrings)
real(c_double), intent(in), optional :: weight(2 * nside)
type(c_ptr), intent(out) :: geom_info
end subroutine sharp_make_subset_healpix_geom_info
! Binds C 'sharp_destroy_geom_info'; the handle is passed by value.
subroutine c_sharp_destroy_geom_info(geom_info) bind(c, name='sharp_destroy_geom_info')
use iso_c_binding
type(c_ptr), value :: geom_info
end subroutine c_sharp_destroy_geom_info
! Binds C 'sharp_map_size'; takes the handle by value and returns a
! c_intptr_t-sized integer.
function c_sharp_map_size(info) bind(c, name='sharp_map_size')
use iso_c_binding
integer(c_intptr_t) :: c_sharp_map_size
type(c_ptr), value :: info
end function c_sharp_map_size
! execute
! Binds C 'sharp_execute'.  alm and map are assumed-size arrays of C
! pointers; time and opcnt are optional outputs.
subroutine c_sharp_execute(type, spin, alm, map, geom_info, alm_info, ntrans, &
flags, time, opcnt) bind(c, name='sharp_execute')
use iso_c_binding
integer(c_int), value :: type, spin, ntrans, flags
type(c_ptr), value :: alm_info, geom_info
real(c_double), intent(out), optional :: time
integer(c_long_long), intent(out), optional :: opcnt
type(c_ptr), intent(in) :: alm(*), map(*)
end subroutine c_sharp_execute
! Binds C 'sharp_execute_mpi_fortran'.  Same argument list as
! c_sharp_execute with a leading communicator passed by value as a c_int
! (presumably the Fortran MPI communicator handle -- confirm).
subroutine c_sharp_execute_mpi(comm, type, spin, alm, map, geom_info, alm_info, ntrans, &
flags, time, opcnt) bind(c, name='sharp_execute_mpi_fortran')
use iso_c_binding
integer(c_int), value :: comm, type, spin, ntrans, flags
type(c_ptr), value :: alm_info, geom_info
real(c_double), intent(out), optional :: time
integer(c_long_long), intent(out), optional :: opcnt
type(c_ptr), intent(in) :: alm(*), map(*)
end subroutine c_sharp_execute_mpi
! Legendre transforms
! Binds C 'sharp_legendre_transform' (c_double data).  bl and the optional
! recfac have lmax + 1 entries; x and out have nx entries.
subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) &
bind(c, name='sharp_legendre_transform')
use iso_c_binding
integer(c_intptr_t), value :: lmax, nx
real(c_double) :: bl(lmax + 1), x(nx), out(nx)
real(c_double), optional :: recfac(lmax + 1)
end subroutine c_sharp_legendre_transform
! Binds C 'sharp_legendre_transform_s': same layout with c_float data.
subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) &
bind(c, name='sharp_legendre_transform_s')
use iso_c_binding
integer(c_intptr_t), value :: lmax, nx
real(c_float) :: bl(lmax + 1), x(nx), out(nx)
real(c_float), optional :: recfac(lmax + 1)
end subroutine c_sharp_legendre_transform_s
end interface
! Generic name sharp_execute: at present it has a single specific
! procedure, sharp_execute_d.
interface sharp_execute
module procedure sharp_execute_d
end interface
! Generic name sharp_legendre_transform: resolves to one of the two listed
! module procedures (presumably the double and single precision variants,
! going by the _d/_s suffixes -- confirm against their definitions).
interface sharp_legendre_transform
module procedure sharp_legendre_transform_d, sharp_legendre_transform_s
end interface sharp_legendre_transform
contains
! alm info
! if ms is not passed, we default to using m=0..lmax.
subroutine sharp_make_mmajor_real_packed_alm_info(lmax, ms, alm_info)
  ! Create an alm_info descriptor through the C routine
  ! sharp_make_mmajor_real_packed_alm_info (stride 1) and record its
  ! coefficient count.
  !
  !   lmax     - forwarded unchanged to the C routine
  !   ms       - optional list of m values; when absent the C routine is
  !              called with nm = lmax + 1 and no m list
  !   alm_info - receives the C handle, and in n_local the value returned
  !              by sharp_alm_count for that handle
  use iso_c_binding
  integer(c_int), value, intent(in) :: lmax
  integer(c_int), intent(in), optional :: ms(:)
  type(sharp_alm_info), intent(out) :: alm_info
  !--
  integer(c_int), allocatable :: m_list(:)
  integer(c_int) :: n_m

  if (.not. present(ms)) then
    call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, lmax + 1, alm_info=alm_info%handle)
  else
    ! The assumed-shape argument is copied into a freshly allocated array
    ! before being handed to the C side.
    n_m = size(ms)
    allocate(m_list(n_m))
    m_list = ms
    call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, n_m, m_list, alm_info=alm_info%handle)
    deallocate(m_list)
  end if

  alm_info%n_local = c_sharp_alm_count(alm_info%handle)
end subroutine sharp_make_mmajor_real_packed_alm_info
subroutine sharp_destroy_alm_info(alm_info)
  ! Release the C-side object referenced by alm_info%handle.
  !
  !   alm_info - wrapper whose handle is passed to the C destructor; on
  !              return the handle is c_null_ptr (n_local is not changed)
  !
  ! Nothing is done when the handle is already null, so destroying the same
  ! wrapper a second time does not pass a null pointer to the C library.
  use iso_c_binding
  type(sharp_alm_info), intent(inout) :: alm_info

  if (.not. c_associated(alm_info%handle)) return

  call c_sharp_destroy_alm_info(alm_info%handle)
  alm_info%handle = c_null_ptr
end subroutine sharp_destroy_alm_info
! geom info
subroutine sharp_make_healpix_geom_info(nside, rings, weight, geom_info)
  ! Create a HEALPix geom_info descriptor through the C routine
  ! sharp_make_subset_healpix_geom_info (stride 1) and record its map size.
  !
  !   nside     - forwarded unchanged to the C routine
  !   rings     - optional list of rings; when absent the C routine is
  !               called with nrings = 4*nside - 1 and no ring list
  !   weight    - optional array of 2*nside weights, passed straight through
  !   geom_info - receives the C handle, and in n_local the value returned
  !               by sharp_map_size for that handle
  integer(c_int), value :: nside
  integer(c_int), optional :: rings(:)
  real(c_double), intent(in), optional :: weight(2 * nside)
  type(sharp_geom_info), intent(out) :: geom_info
  !--
  integer(c_int), allocatable :: ring_list(:)
  integer(c_int) :: n_rings

  if (.not. present(rings)) then
    call sharp_make_subset_healpix_geom_info(nside, 1, nrings=4 * nside - 1, &
         weight=weight, geom_info=geom_info%handle)
  else
    ! The assumed-shape argument is copied into a freshly allocated array
    ! before being handed to the C side.
    n_rings = size(rings)
    allocate(ring_list(n_rings))
    ring_list = rings
    call sharp_make_subset_healpix_geom_info(nside, 1, n_rings, ring_list, &
         weight, geom_info%handle)
    deallocate(ring_list)
  end if

  geom_info%n_local = c_sharp_map_size(geom_info%handle)
end subroutine sharp_make_healpix_geom_info
subroutine sharp_destroy_geom_info(geom_info)
  ! Release the C-side object referenced by geom_info%handle.
  !
  !   geom_info - wrapper whose handle is passed to the C destructor; on
  !               return the handle is c_null_ptr (n_local is not changed)
  !
  ! Nothing is done when the handle is already null, so destroying the same
  ! wrapper a second time does not pass a null pointer to the C library.
  use iso_c_binding
  type(sharp_geom_info), intent(inout) :: geom_info

  if (.not. c_associated(geom_info%handle)) return

  call c_sharp_destroy_geom_info(geom_info%handle)
  geom_info%handle = c_null_ptr
end subroutine sharp_destroy_geom_info
! Currently the only mode supported is stacked (not interleaved) maps.
!
! Note that passing the exact dimensions of alm/map is necessary; it
! prevents the caller from doing overly exotic slicing before passing
! the arrays in.
!
! Usage:
!
!   The alm array must have shape exactly alm(alm_info%n_local, nmaps).
!   The map array must have shape exactly map(geom_info%n_local, nmaps).
!
! Arguments:
!   type, spin  - forwarded unchanged to the C execute routine
!   nmaps       - number of columns in alm and map; the transform count
!                 passed to C is nmaps for spin 0 and nmaps / 2 otherwise
!   add         - optional; when present and true, SHARP_ADD is set in
!                 the flags (add to the output instead of replacing it)
!   time, opcnt - optional outputs forwarded to the C routine
!   comm        - optional communicator; when present the MPI variant
!                 c_sharp_execute_mpi is called, else c_sharp_execute
subroutine sharp_execute_d(type, spin, nmaps, alm, alm_info, map, geom_info, &
                           add, time, opcnt, comm)
  use iso_c_binding
  use mpi
  implicit none
  integer(c_int), value :: type, spin, nmaps
  integer(c_int), optional :: comm
  logical, value, optional :: add  ! should add instead of replace out
  type(sharp_alm_info) :: alm_info
  type(sharp_geom_info) :: geom_info
  real(c_double), intent(out), optional :: time
  integer(c_long_long), intent(out), optional :: opcnt
  real(c_double), target, intent(inout) :: alm(0:alm_info%n_local - 1, 1:nmaps)
  real(c_double), target, intent(inout) :: map(0:geom_info%n_local - 1, 1:nmaps)
  !--
  integer(c_int) :: mod_flags, ntrans, k
  type(c_ptr), target :: alm_ptr(nmaps)
  type(c_ptr), target :: map_ptr(nmaps)

  mod_flags = SHARP_DP
  ! Fortran does not guarantee short-circuit evaluation of .and., so an
  ! absent optional must never appear in the same expression as its
  ! present() test; the two checks are nested instead.
  if (present(add)) then
    if (add) mod_flags = ior(mod_flags, SHARP_ADD)
  end if

  if (spin == 0) then
    ntrans = nmaps
  else
    ntrans = nmaps / 2
  end if

  ! Set up pointer table to access maps. Entries stay null when the
  ! corresponding local array is empty, so c_loc is never applied to a
  ! zero-sized array.
  alm_ptr(:) = c_null_ptr
  map_ptr(:) = c_null_ptr
  do k = 1, nmaps
    if (alm_info%n_local > 0) alm_ptr(k) = c_loc(alm(0, k))
    if (geom_info%n_local > 0) map_ptr(k) = c_loc(map(0, k))
  end do

  if (present(comm)) then
    call c_sharp_execute_mpi(comm, type, spin, alm_ptr, map_ptr, &
                             geom_info=geom_info%handle, &
                             alm_info=alm_info%handle, &
                             ntrans=ntrans, &
                             flags=mod_flags, &
                             time=time, &
                             opcnt=opcnt)
  else
    call c_sharp_execute(type, spin, alm_ptr, map_ptr, &
                         geom_info=geom_info%handle, &
                         alm_info=alm_info%handle, &
                         ntrans=ntrans, &
                         flags=mod_flags, &
                         time=time, &
                         opcnt=opcnt)
  end if
end subroutine sharp_execute_d
subroutine sharp_legendre_transform_d(bl, x, out)
  ! Double-precision wrapper around c_sharp_legendre_transform.
  !
  !   bl  - coefficient array; size(bl) - 1 is passed as lmax
  !   x   - input array; size(x) is passed as nx
  !   out - result array with the same extent as x
  use iso_c_binding
  real(c_double) :: bl(:)
  real(c_double) :: x(:), out(size(x))
  !--
  integer(c_intptr_t) :: lmax, nx

  lmax = int(size(bl) - 1, c_intptr_t)
  nx   = int(size(x), c_intptr_t)
  call c_sharp_legendre_transform(bl, lmax=lmax, x=x, out=out, nx=nx)
end subroutine sharp_legendre_transform_d
subroutine sharp_legendre_transform_s(bl, x, out)
  ! Single-precision wrapper around c_sharp_legendre_transform_s.
  !
  !   bl  - coefficient array; size(bl) - 1 is passed as lmax
  !   x   - input array; size(x) is passed as nx
  !   out - result array with the same extent as x
  use iso_c_binding
  real(c_float) :: bl(:)
  real(c_float) :: x(:), out(size(x))
  !--
  integer(c_intptr_t) :: lmax, nx

  lmax = int(size(bl) - 1, c_intptr_t)
  nx   = int(size(x), c_intptr_t)
  call c_sharp_legendre_transform_s(bl, lmax=lmax, x=x, out=out, nx=nx)
end subroutine sharp_legendre_transform_s
end module

84
external/sharp/fortran/test_sharp.f90 vendored Normal file
View File

@ -0,0 +1,84 @@
program test_sharp
  ! Smoke test for the Fortran libsharp wrapper.
  !
  ! Each MPI rank takes every nodecount-th m value and HEALPix ring
  ! (round-robin by rank), builds the matching alm/geometry descriptors,
  ! runs one SHARP_Y transform over MPI_COMM_WORLD and prints the
  ! arrays. After MPI_Finalize the serial Legendre-transform wrappers
  ! are exercised in both precisions.
  use mpi
  use sharp
  ! c_int is imported explicitly: it is used in the declarations below
  ! and should not depend on the sharp module happening to re-export it.
  use iso_c_binding, only : c_ptr, c_double, c_int
  implicit none

  integer, parameter :: lmax = 2, nside = 2
  type(sharp_alm_info) :: alm_info
  type(sharp_geom_info) :: geom_info
  real(c_double), dimension(0:(lmax + 1)**2 - 1, 1:1) :: alm
  real(c_double), dimension(0:12*nside**2 - 1, 1:1) :: map
  integer(c_int), dimension(1:lmax + 1) :: ms
  integer(c_int), dimension(1:4 * nside - 1) :: rings
  integer(c_int) :: nm, m, nrings, iring
  integer :: nodecount, rank, ierr

  call MPI_Init(ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, nodecount, ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)

  ! Round-robin distribution of m values over the ranks.
  nm = 0
  do m = rank, lmax, nodecount
    nm = nm + 1
    ms(nm) = m
  end do

  ! Round-robin distribution of ring numbers (1-based) over the ranks.
  nrings = 0
  do iring = rank + 1, 4 * nside - 1, nodecount
    nrings = nrings + 1
    rings(nrings) = iring
  end do

  ! Only the first coefficient on rank 0 is non-zero.
  alm = 0
  map = 0
  if (rank == 0) then
    alm(0, 1) = 1
  end if

  print *, ms(1:nm)
  call sharp_make_mmajor_real_packed_alm_info(lmax, ms=ms(1:nm), alm_info=alm_info)
  print *, 'alm_info%n_local', alm_info%n_local
  call sharp_make_healpix_geom_info(nside, rings=rings(1:nrings), geom_info=geom_info)
  print *, 'geom_info%n_local', geom_info%n_local

  print *, 'execute'
  call sharp_execute(SHARP_Y, 0, 1, alm, alm_info, map, geom_info, comm=MPI_COMM_WORLD)
  print *, alm
  print *, map

  call sharp_destroy_alm_info(alm_info)
  call sharp_destroy_geom_info(geom_info)
  print *, 'DONE'
  call MPI_Finalize(ierr)

  print *, 'LEGENDRE TRANSFORMS'
  call test_legendre_transforms()

contains

  subroutine test_legendre_transforms()
    ! Call the generic sharp_legendre_transform once with double and
    ! once with single precision arguments and print both results.
    integer, parameter :: lmax = 20, nx=10
    real(c_double) :: bl(0:lmax)
    real(c_double) :: x(nx), out(nx)
    real(c_float) :: out_s(nx)
    !--
    integer :: l, i

    do l = 0, lmax
      bl(l) = 1.0 / real(l + 1, c_double)
    end do
    do i = 1, nx
      x(i) = 1 / real(i, c_double)
    end do
    out = 0
    call sharp_legendre_transform(bl, x, out)
    print *, out
    call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s)
    print *, out_s
  end subroutine test_legendre_transforms

end program test_sharp

View File

@ -122,7 +122,7 @@ void kill_real_plan (real_plan plan);
- on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt>
(a total of \a length values). */ (a total of \a length values). */
void real_plan_forward_fftpack (real_plan plan, double *data); void real_plan_forward_fftpack (real_plan plan, double *data);
/*! Computes a real forward FFT on \a data, using \a plan /*! Computes a real backward FFT on \a data, using \a plan
and assuming the FFTPACK storage scheme: and assuming the FFTPACK storage scheme:
- on entry, \a data has the form <tt>r0, r1, i1, r2, i2, ...</tt> - on entry, \a data has the form <tt>r0, r1, i1, r2, i2, ...</tt>
(a total of \a length values); (a total of \a length values);
@ -143,8 +143,7 @@ void real_plan_backward_fftw (real_plan plan, double *data);
- on entry, \a data has the form <tt>r0, [ignored], r1, [ignored], ..., - on entry, \a data has the form <tt>r0, [ignored], r1, [ignored], ...,
r[length-1], [ignored]</tt>; r[length-1], [ignored]</tt>;
- on exit, it has the form <tt>r0, i0, r1, i1, ..., - on exit, it has the form <tt>r0, i0, r1, i1, ...,
r[length-1], i[length-1]</tt>. r[length-1], i[length-1]</tt>. */
*/
void real_plan_forward_c (real_plan plan, double *data); void real_plan_forward_c (real_plan plan, double *data);
/*! Computes a real backward FFT on \a data, using \a plan /*! Computes a real backward FFT on \a data, using \a plan
and assuming a full-complex storage scheme: and assuming a full-complex storage scheme:

View File

@ -7,10 +7,9 @@
/*! \page introduction Introduction to libsharp /*! \page introduction Introduction to libsharp
"SHARP" is an acronym for <i>Performant Spherical Harmonic Transforms</i>. "SHARP" is an acronym for <i>Spherical HARmonic Package</i>.
All user-visible data types and functions in this library start with All user-visible data types and functions in this library start with
the prefix "sharp_", or with "sharps_" and "sharpd_" for single- and the prefix "sharp_" to avoid pollution of the global C namespace.
double precision variants, respectively.
<i>libsharp</i>'s main functionality is the conversion between <i>maps</i> <i>libsharp</i>'s main functionality is the conversion between <i>maps</i>
on the sphere and <i>spherical harmonic coefficients</i> (or <i>a_lm</i>). on the sphere and <i>spherical harmonic coefficients</i> (or <i>a_lm</i>).
@ -57,7 +56,7 @@
for generating often-used pixelisations like ECP grids, Gaussian grids, for generating often-used pixelisations like ECP grids, Gaussian grids,
and Healpix grids. and Healpix grids.
Currently, SHARP supports the following kinds of transforms: Currently, libsharp supports the following kinds of transforms:
<ul> <ul>
<li>scalar a_lm to map</li> <li>scalar a_lm to map</li>
<li>scalar map to a_lm</li> <li>scalar map to a_lm</li>
@ -68,10 +67,10 @@
<li>scalar a_lm to maps of first derivatives</li> <li>scalar a_lm to maps of first derivatives</li>
</ul> </ul>
SHARP supports shared-memory parallelisation via OpenMP; this feature will libsharp supports shared-memory parallelisation via OpenMP; this feature will
be automatically enabled if the compiler supports it. be automatically enabled if the compiler supports it.
SHARP will also make use of SSE2 and AVX instructions when compiled for a Libsharp will also make use of SSE2 and AVX instructions when compiled for a
platform known to support them. platform known to support them.
Support for MPI-parallel transforms is also available; in this mode, Support for MPI-parallel transforms is also available; in this mode,
@ -83,12 +82,4 @@
single-precision transforms will most likely not be faster than their single-precision transforms will most likely not be faster than their
double-precision counterparts, but they will require significantly less double-precision counterparts, but they will require significantly less
memory. memory.
Two example and benchmark programs are distributed with SHARP:
<ul>
<li>sharp_test.c checks the accuracy of the (iterative) map analysis
algorithm</li>
<li>sharp_bench.c determines the quickest transform strategy for a given
SHT</li>
</ul>
*/ */

View File

@ -7,14 +7,14 @@ FULL_INCLUDE+= -I$(SD)
HDR_$(PKG):=$(SD)/*.h HDR_$(PKG):=$(SD)/*.h
LIB_$(PKG):=$(LIBDIR)/libsharp.a LIB_$(PKG):=$(LIBDIR)/libsharp.a
BIN:=sharp_test sharp_acctest sharp_test_mpi sharp_bench sharp_bench2 BIN:=sharp_testsuite
LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o
ALLOBJ:=$(LIBOBJ) sharp_test.o sharp_acctest.o sharp_test_mpi.o sharp_bench.o sharp_bench2.o ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
LIBOBJ:=$(LIBOBJ:%=$(OD)/%) LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
ALLOBJ:=$(ALLOBJ:%=$(OD)/%) ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils) ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c $(OD)/sharp_core.o: $(SD)/sharp_core_inchelper.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(OD)/sharp.o: $(SD)/sharp_mpi.c
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils) BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)

File diff suppressed because it is too large Load Diff

View File

@ -39,5 +39,7 @@
#include <complex.h> #include <complex.h>
#include "sharp_lowlevel.h" #include "sharp_lowlevel.h"
#include "sharp_legendre.h"
#include "sharp_legendre_roots.h"
#endif #endif

View File

@ -1,267 +0,0 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_acctest.c
Systematic accuracy test for libsharp.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
typedef complex double dcmplx;
/* Pseudo-random double built from rand(): min plus (max-min)*rand()
   divided by RAND_MAX+1.0. */
static double drand (double min, double max)
  {
  const double scaled = (max-min)*rand();
  return min + scaled/(RAND_MAX+1.0);
  }
/* Fill alm with random coefficients for every (l,m) described by helper.
   Entries with both l<spin and m<spin are set to zero; for m==0 the
   imaginary part is zero. Real and imaginary parts are drawn from
   drand(-1,1), real part first. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
  {
  for (int mi=0; mi<helper->nm; ++mi)
    {
    const int m=helper->mval[mi];
    for (int l=m; l<=helper->lmax; ++l)
      {
      dcmplx value = 0.;
      if (!((l<spin)&&(m<spin)))
        {
        const double re = drand(-1,1);
        const double im = (m==0) ? 0 : drand(-1,1);
        value = re+_Complex_I*im;
        }
      alm[sharp_alm_index(helper,l,mi)] = value;
      }
    }
  }
/* For each of the ncomp components, compare alm2 against alm over nalms
   coefficients and assert that both the largest absolute difference of
   any real/imaginary part and the ratio of RMS difference to RMS of alm
   are below 1e-10. */
static void measure_errors (dcmplx **alm, dcmplx **alm2,
  ptrdiff_t nalms, int ncomp)
  {
  for (int comp=0; comp<ncomp; ++comp)
    {
    const dcmplx *ref=alm[comp], *res=alm2[comp];
    double err2=0, norm2=0, worst=0;
    for (ptrdiff_t k=0; k<nalms; ++k)
      {
      const double re=creal(ref[k]), im=cimag(ref[k]);
      const double dre=re-creal(res[k]), dim=im-cimag(res[k]);
      err2+=dre*dre+dim*dim;
      norm2+=re*re+im*im;
      if (fabs(dre)>worst) worst=fabs(dre);
      if (fabs(dim)>worst) worst=fabs(dim);
      }
    const double rms_err=sqrt(err2/nalms);
    const double rms_ref=sqrt(norm2/nalms);
    UTIL_ASSERT((worst<1e-10)&&(rms_err/rms_ref<1e-10),"error");
    }
  }
/* Regression check of signs and scales of the synthesis transforms.
   Builds a Gauss grid (lmax=50, lmax+1 rings, 2*lmax+2 pixels per ring)
   with flipped colatitudes, sets every a_lm to 1+i, and for ntrans=1..9
   runs SHARP_ALM2MAP with spin 0, 1 and 2 plus SHARP_ALM2MAP_DERIV1 with
   spin 1. After each run the first, middle (npix/2) and last pixel of
   every output map are compared against hard-coded reference values via
   FAPPROX; any mismatch trips UTIL_ASSERT. */
static void check_sign_scale(void)
{
int lmax=50;
int mmax=lmax;
sharp_geom_info *tinfo;
int nrings=lmax+1;
int ppring=2*lmax+2;
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
/* flip theta to emulate the "old" Gaussian grid geometry */
for (int i=0; i<tinfo->npairs; ++i)
{
const double pi=3.141592653589793238462643383279502884197;
tinfo->pair[i].r1.cth=-tinfo->pair[i].r1.cth;
tinfo->pair[i].r2.cth=-tinfo->pair[i].r2.cth;
tinfo->pair[i].r1.theta=pi-tinfo->pair[i].r1.theta;
tinfo->pair[i].r2.theta=pi-tinfo->pair[i].r2.theta;
}
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
/* number of (l,m) pairs with 0<=m<=mmax and m<=l<=lmax */
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
for (int ntrans=1; ntrans<10; ++ntrans)
{
/* 2*ntrans components are allocated so the same buffers also serve
   the spin>0 calls, which read/write two maps per transform */
double **map;
ALLOC2D(map,double,2*ntrans,npix);
dcmplx **alm;
ALLOC2D(alm,dcmplx,2*ntrans,nalms);
for (int i=0; i<2*ntrans; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
/* spin 0: one map per transform */
sharp_execute(SHARP_ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[it][0 ], 3.588246976618616912e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
"error");
}
/* spin 1: maps 2*it and 2*it+1 belong to transform it */
sharp_execute(SHARP_ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ], 2.750897760535633285e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2], 3.137704477368562905e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-8.405730859837063917e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-2.398026536095463346e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
"error");
}
/* spin 2 */
sharp_execute(SHARP_ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-1.398186224727334448e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.456676000884031197e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.516249174408820863e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-3.173406200299964119e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
"error");
}
/* first-derivative maps; note the looser 1e-11 tolerance on the first
   sample below */
sharp_execute(SHARP_ALM2MAP_DERIV1,1,0,&alm[0],&map[0],tinfo,alms,ntrans,1,
0,NULL,NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-6.859393905369091105e-01,1e-11),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.103947835973212364e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.092463246472086439e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-1.411433220713928165e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
"error");
}
DEALLOC2D(map);
DEALLOC2D(alm);
}
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
}
/* Round-trip accuracy check: random a_lm (seeded with srand(4)) are
   synthesised to maps with SHARP_ALM2MAP, analysed back with
   SHARP_MAP2ALM, and the result is compared with the input through
   measure_errors(). ntrans transforms are run at once; each uses one
   component for spin 0 and two otherwise. */
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int nv)
  {
  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  const int comp_per_trans = (spin==0) ? 1 : 2;
  const int ncomp = ntrans*comp_per_trans;

  double **map;
  ALLOC2D(map,double,ncomp,npix);

  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);

  srand(4);
  dcmplx **alm_in;
  ALLOC2D(alm_in,dcmplx,ncomp,nalms);
  for (int c=0; c<ncomp; ++c)
    random_alm(alm_in[c],ainfo,spin);

  dcmplx **alm_out;
  ALLOC2D(alm_out,dcmplx,ncomp,nalms);

  sharp_execute(SHARP_ALM2MAP,spin,0,&alm_in[0],&map[0],tinfo,ainfo,ntrans,1,
    nv,NULL,NULL);
  sharp_execute(SHARP_MAP2ALM,spin,0,&alm_out[0],&map[0],tinfo,ainfo,ntrans,1,
    nv,NULL,NULL);
  measure_errors(alm_in,alm_out,nalms,ncomp);

  DEALLOC2D(map);
  DEALLOC2D(alm_in);
  DEALLOC2D(alm_out);
  sharp_destroy_alm_info(ainfo);
  }
/* Accuracy test driver: runs the sign/scale regression check, then the
   round-trip accuracy check on a Gauss grid (lmax=127) for nv=1..6,
   ntrans=1..6 and spins 0, 1, 2, 3 and 30. */
int main(void)
  {
#ifdef USE_MPI
  MPI_Init(NULL,NULL);
#endif
  sharp_module_startup("sharp_acctest",1,1,"",1);

  static const int spins[] = { 0, 1, 2, 3, 30 };
  const int nspins = (int)(sizeof(spins)/sizeof(spins[0]));
  const int lmax=127;

  printf("Checking signs and scales.\n");
  check_sign_scale();
  printf("Passed.\n\n");

  printf("Testing map analysis accuracy.\n");
  const int nrings=lmax+1;
  const int ppring=2*lmax+2;
  const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  sharp_geom_info *tinfo;
  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
  for (int nv=1; nv<=6; ++nv)
    for (int ntrans=1; ntrans<=6; ++ntrans)
      for (int s=0; s<nspins; ++s)
        check_accuracy(tinfo,lmax,lmax,npix,spins[s],ntrans,nv);
  sharp_destroy_geom_info(tinfo);
  printf("Passed.\n\n");

#ifdef USE_MPI
  MPI_Finalize();
#endif
  return 0;
  }

View File

@ -25,7 +25,7 @@
/*! \file sharp_almhelpers.c /*! \file sharp_almhelpers.c
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2008-2011 Max-Planck-Society * Copyright (C) 2008-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -41,7 +41,8 @@ void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
info->mval = RALLOC(int,mmax+1); info->mval = RALLOC(int,mmax+1);
info->mvstart = RALLOC(ptrdiff_t,mmax+1); info->mvstart = RALLOC(ptrdiff_t,mmax+1);
info->stride = stride; info->stride = stride;
int tval = 2*lmax+1; info->flags = 0;
ptrdiff_t tval = 2*lmax+1;
for (ptrdiff_t m=0; m<=mmax; ++m) for (ptrdiff_t m=0; m<=mmax; ++m)
{ {
info->mval[m] = m; info->mval[m] = m;
@ -59,6 +60,7 @@ void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
info->mval = RALLOC(int,mmax+1); info->mval = RALLOC(int,mmax+1);
info->mvstart = RALLOC(ptrdiff_t,mmax+1); info->mvstart = RALLOC(ptrdiff_t,mmax+1);
info->stride = stride; info->stride = stride;
info->flags = 0;
for (ptrdiff_t m=0; m<=mmax; ++m) for (ptrdiff_t m=0; m<=mmax; ++m)
{ {
info->mval[m] = m; info->mval[m] = m;
@ -66,3 +68,27 @@ void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
} }
*alm_info = info; *alm_info = info;
} }
/* Build an alm_info flagged SHARP_PACKED | SHARP_REAL_HARMONICS for nm
   values of m. With ms==NULL the m values are 0..nm-1, otherwise they
   are taken from ms. Each m occupies (lmax+1-m) slots for m==0 and twice
   that for m>0; mvstart[im] is the running slot count minus that
   factor times m, scaled by stride. */
void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
  int nm, const int *ms, sharp_alm_info **alm_info)
  {
  sharp_alm_info *info = RALLOC(sharp_alm_info,1);
  info->lmax = lmax;
  info->nm = nm;
  info->mval = RALLOC(int,nm);
  info->mvstart = RALLOC(ptrdiff_t,nm);
  info->stride = stride;
  info->flags = SHARP_PACKED | SHARP_REAL_HARMONICS;

  /* number of elements consumed by the m values handled so far */
  ptrdiff_t used = 0;
  for (int im=0; im!=nm; ++im)
    {
    const int m = (ms==NULL) ? im : ms[im];
    const int mult = (m==0) ? 1 : 2;
    info->mval[im] = m;
    /* corrected by mult*m because indexing within an m starts at l=m */
    info->mvstart[im] = stride * (used - mult * m);
    used += mult * (lmax + 1 - m);
    }
  *alm_info = info;
  }

View File

@ -50,6 +50,14 @@ void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride, void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info); sharp_alm_info **alm_info);
/*! Initialises alm_info for m-major, real, packed spherical harmonics.
To use all m from 0 to \a mmax, pass \a mmax + 1 as \a nm and NULL as \a ms;
otherwise \a ms lists the \a nm values of m to process (selecting a subset
should only be used for MPI parallelization).
\ingroup almgroup */
void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
int nm, const int *ms, sharp_alm_info **alm_info);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -1,149 +0,0 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_bench.c
Copyright (C) 2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
typedef complex double dcmplx;
/* Run one transform configuration four times on a fixed Gauss grid
   (lmax=2047, mmax=128, 512 rings of 1024 pixels) with zero-filled
   inputs, and return the smallest time and smallest operation count
   reported by sharp_execute in *time and *opcnt. */
static void bench_sht (int spin, int nv, sharp_jobtype type,
  int ntrans, double *time, unsigned long long *opcnt)
  {
  const int lmax=2047, mmax=128;
  const int nrings=512, ppring=1024;
  const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  const int ncomp = ntrans*((spin==0) ? 1 : 2);

  sharp_geom_info *tinfo;
  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);

  double **map;
  ALLOC2D(map,double,ncomp,npix);
  SET_ARRAY(map[0],0,npix*ncomp,0.);

  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);

  dcmplx **alm;
  ALLOC2D(alm,dcmplx,ncomp,nalms);
  SET_ARRAY(alm[0],0,nalms*ncomp,0.);

  /* start from sentinels larger than any expected measurement */
  *time=1e30;
  *opcnt=1000000000000000;
  for (int run=0; run<4; ++run)
    {
    double run_time;
    unsigned long long run_ops;
    sharp_execute(type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,nv,
      &run_time,&run_ops);
    if (run_ops<*opcnt) *opcnt=run_ops;
    if (run_time<*time) *time=run_time;
    }

  DEALLOC2D(map);
  DEALLOC2D(alm);
  sharp_destroy_alm_info(alms);
  sharp_destroy_geom_info(tinfo);
  }
/* Benchmark driver. For every combination of ntr=1..6, spin in {0,2} and
   job type from SHARP_MAP2ALM to SHARP_ALM2MAP_DERIV1 it times bench_sht()
   for nv=1..sharp_get_nv_max(), prints the fastest nv together with the
   deviation of the nv suggested by sharp_nv_oracle(), and writes the
   fastest nv values as a C table "nv_opt[6][2][3]" to the file
   "sharp_oracle.inc". The combination (SHARP_ALM2MAP_DERIV1, spin 0) is
   not benchmarked and is written as -1. */
int main(void)
{
#ifdef USE_MPI
MPI_Init(NULL,NULL);
#endif
sharp_module_startup("sharp_bench",1,1,"",1);
printf("Benchmarking SHTs.\n\n");
FILE *fp=fopen("sharp_oracle.inc","w");
UTIL_ASSERT(fp, "failed to open oracle file for writing");
fprintf(fp,"static const int maxtr = 6;\n");
fprintf(fp,"static const int nv_opt[6][2][3] = {\n");
/* indexed by job type below */
const char *shtname[]={"map2alm","alm2map","a2mder1"};
for (int ntr=1; ntr<=6; ++ntr)
{
fprintf(fp,"{");
for (int spin=0; spin<=2; spin+=2)
{
fprintf(fp,"{");
for (sharp_jobtype type=SHARP_MAP2ALM; type<=SHARP_ALM2MAP_DERIV1; ++type)
{
if ((type==SHARP_ALM2MAP_DERIV1) && (spin==0))
fprintf(fp,"-1");
else
{
int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
unsigned long long opmin=1000000000000000, op;
double tmin=1e30;
/* time[] is indexed directly by nv (1-based), hence the +1 */
double *time=RALLOC(double,sharp_get_nv_max()+1);
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
{
bench_sht (spin,nv,type,ntr,&time[nv],&op);
if (op<opmin) opmin=op;
if (time[nv]<tmin)
{ tmin=time[nv]; nvbest=nv; }
}
printf("nt: %d %s spin: %d nv: %d time: %6.3f perf: %6.3f"
" dev[%d]: %6.2f%%\n",ntr,shtname[type],
spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
(time[nvoracle]-tmin)/tmin*100.);
DEALLOC(time);
fprintf(fp,"%d",nvbest);
}
/* separators: comma after every entry except the last of each row */
if (type!=SHARP_ALM2MAP_DERIV1) fprintf(fp,",");
}
fprintf(fp,(spin==0)?"},":"}");
printf("\n");
}
fprintf(fp,(ntr<6)?"},\n":"}\n");
}
fprintf(fp,"};\n");
fclose(fp);
#ifdef USE_MPI
MPI_Finalize();
#endif
return 0;
}

View File

@ -1,223 +0,0 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_bench2.c
Copyright (C) 2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#if (defined(_OPENMP) && defined(USE_MPI))
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <mpi.h>
#include "sharp_mpi.h"
#include "sharp.h"
#include "sharp_vecutil.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
#include "memusage.h"
typedef complex double dcmplx;
int ntasks, mytask;
/* Sum of val over all tasks in MPI_COMM_WORLD (MPI_Allreduce with MPI_SUM). */
static unsigned long long totalops (unsigned long long val)
  {
  unsigned long long global_sum;
  MPI_Allreduce (&val, &global_sum, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM,
    MPI_COMM_WORLD);
  return global_sum;
  }
/* Maximum of val over all tasks in MPI_COMM_WORLD (MPI_Allreduce with MPI_MAX). */
static double maxTime (double val)
  {
  double global_max;
  MPI_Allreduce (&val, &global_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  return global_max;
  }
/* Sum of val over all tasks in MPI_COMM_WORLD (MPI_Allreduce with MPI_SUM). */
static double totalMem (double val)
  {
  double global_sum;
  MPI_Allreduce (&val, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  return global_sum;
  }
/* Restrict ainfo in place to the m values owned by this task: entries
   mytask, mytask+ntasks, ... are compacted to the front, their mvstart
   is recomputed for a dense local layout, and nm is set to the number
   of entries kept. */
static void reduce_alm_info(sharp_alm_info *ainfo)
  {
  ptrdiff_t next_start = 0;
  int kept = 0;
  for (int src=mytask; src<ainfo->nm; src+=ntasks)
    {
    const int m = ainfo->mval[src];
    ainfo->mval[kept] = m;
    ainfo->mvstart[kept] = next_start-m;
    next_start += ainfo->lmax-m+1;
    ++kept;
    }
  ainfo->nm = kept;
  }
/* Restrict ginfo in place to the ring pairs owned by this task: pairs
   mytask, mytask+ntasks, ... are compacted to the front, the ring
   offsets are reassigned consecutively, and npairs is set to the number
   of pairs kept. A second ring only consumes space when its nph is
   positive. */
static void reduce_geom_info(sharp_geom_info *ginfo)
  {
  ptrdiff_t next_ofs = 0;
  int kept = 0;
  int src = mytask;
  while (src<ginfo->npairs)
    {
    ginfo->pair[kept] = ginfo->pair[src];
    ginfo->pair[kept].r1.ofs = next_ofs;
    next_ofs += ginfo->pair[kept].r1.nph;
    ginfo->pair[kept].r2.ofs = next_ofs;
    if (ginfo->pair[kept].r2.nph>0)
      next_ofs += ginfo->pair[kept].r2.nph;
    src += ntasks;
    ++kept;
    }
  ginfo->npairs = kept;
  }
/* Number of a_lm coefficients described by ainfo: lmax-m+1 for every
   stored m value. */
static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
  {
  ptrdiff_t total = 0;
  int mi = 0;
  while (mi<ainfo->nm)
    {
    total += ainfo->lmax-ainfo->mval[mi]+1;
    ++mi;
    }
  return total;
  }
/* Number of pixels described by ginfo: nph of the first ring of every
   pair, plus nph of the second ring where that is positive. */
static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
  {
  ptrdiff_t total = 0;
  int ip = 0;
  while (ip<ginfo->npairs)
    {
    total += ginfo->pair[ip].r1.nph;
    if (ginfo->pair[ip].r2.nph>0)
      total += ginfo->pair[ip].r2.nph;
    ++ip;
    }
  return total;
  }
/* MPI+OpenMP benchmark driver.
   Usage: <healpix|ecp|gauss> <lmax> <nside|nphi> <a2m/m2a> <spin> <ntrans>

   Builds the requested grid and a triangular a_lm layout, keeps only the
   ring pairs and m values owned by this task, and runs the selected
   transform repeatedly (at least twice, and until ntries*time reaches 5).
   The fastest run, taken as the maximum time over all tasks, is reported
   by the master task on one line together with the summed operation count
   and the summed VmHWM() memory figure. */
int main(int argc, char **argv)
  {
  MPI_Init(NULL,NULL);
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
  int master=(mytask==0);

  sharp_module_startup("sharp_bench2",argc,7,
    "<healpix|ecp|gauss> <lmax> <nside|nphi> <a2m/m2a> <spin> <ntrans>",0);

  int lmax=atoi(argv[2]);
  sharp_jobtype jtype = (strcmp(argv[4],"a2m")==0) ?
    SHARP_ALM2MAP : SHARP_MAP2ALM;
  int spin=atoi(argv[5]);
  int ntrans=atoi(argv[6]);

  sharp_geom_info *tinfo;
  ptrdiff_t npix=0;
  int geom2=0;   /* ring count of the full grid, only used in the report */
  if (strcmp(argv[1],"gauss")==0)
    {
    int nrings=geom2=lmax+1;
    int ppring=atoi(argv[3]);
    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
    }
  else if (strcmp(argv[1],"ecp")==0)
    {
    int nrings=geom2=2*lmax+2;
    int ppring=atoi(argv[3]);
    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
    }
  else if (strcmp(argv[1],"healpix")==0)
    {
    int nside=atoi(argv[3]);
    if (nside<1) nside=1;
    geom2=4*nside-1;
    sharp_make_healpix_geom_info (nside, 1, &tinfo);
    }
  else
    UTIL_FAIL("unknown grid geometry");

  /* keep only this task's share of the rings */
  reduce_geom_info(tinfo);
  npix=get_npix(tinfo);

  int mmax=lmax;
  int ncomp = ntrans*((spin==0) ? 1 : 2);

  double **map;
  ALLOC2D(map,double,ncomp,npix);

  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
  /* keep only this task's share of the m values */
  reduce_alm_info(alms);
  ptrdiff_t nalms=get_nalms(alms);

  dcmplx **alm;
  ALLOC2D(alm,dcmplx,ncomp,nalms);

  /* npix and nalms are ptrdiff_t, so the indices must be as well; an int
     index would overflow once the local sizes exceed INT_MAX */
  for (int n=0; n<ncomp; ++n)
    {
    for (ptrdiff_t i=0; i<npix; ++i) map[n][i]=1;
    for (ptrdiff_t i=0; i<nalms; ++i) alm[n][i]=1;
    }

  double time=1e20;
  unsigned long long opcnt=0;
  for (int ntries=0; (ntries<2)||(ntries*time<5); ++ntries)
    {
    double ltime;
    unsigned long long lopcnt;
    sharp_execute_mpi(MPI_COMM_WORLD,jtype,spin,0,&alm[0],&map[0],
      tinfo,alms,ntrans,1,0,&ltime,&lopcnt);
    ltime=maxTime(ltime);
    if (ltime<time) { time=ltime; opcnt=totalops(lopcnt); }
    }

  DEALLOC2D(map);
  DEALLOC2D(alm);
  sharp_destroy_alm_info(alms);
  sharp_destroy_geom_info(tinfo);

  double mHWM=totalMem(VmHWM());
  int nomp=omp_get_max_threads();

  /* getenv() returns NULL when HOST is unset; passing NULL to %s is
     undefined behaviour, so substitute a placeholder */
  const char *host=getenv("HOST");
  if (host==NULL) host="unknown";

  if (master)
    printf("%-12s %-7s %-3s %2d %d %2d %3d %5d %5d %1d %.2e %7.2f %9.2f\n",
      host,argv[1],argv[4],spin,VLEN,nomp,ntasks,lmax,geom2,ntrans,
      time,opcnt/(time*1e9),mHWM/(1<<20));

  MPI_Finalize();
  return 0;
  }
#else
#include "c_utils.h"
/* Fallback entry point for builds without both OpenMP and MPI: fail
   through UTIL_FAIL; the return value of 1 follows it. */
int main(void)
  {
  UTIL_FAIL("Need OpenMP and MPI");
  return 1;
  }
#endif

View File

@ -25,7 +25,7 @@
/* \file sharp_complex_hacks.h /* \file sharp_complex_hacks.h
* support for converting vector types and complex numbers * support for converting vector types and complex numbers
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012,2013 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -132,4 +132,18 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
#endif #endif
#if (VLEN==8)
/* VLEN==8 variant: horizontal sum of a as the real part and of b as the
   imaginary part, returned as one complex value. */
static inline complex double vhsum_cmplx(Tv a, Tv b)
  {
  const double re = _mm512_reduce_add_pd(a);
  const double im = _mm512_reduce_add_pd(b);
  return re+_Complex_I*im;
  }
/* VLEN==8 variant: adds the horizontal sums of (a,b) to *c1 and of (c,d)
   to *c2, with the first vector of each pair as the real part and the
   second as the imaginary part. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  {
  const double re1 = _mm512_reduce_add_pd(a);
  const double im1 = _mm512_reduce_add_pd(b);
  *c1 += re1+_Complex_I*im1;

  const double re2 = _mm512_reduce_add_pd(c);
  const double im2 = _mm512_reduce_add_pd(d);
  *c2 += re2+_Complex_I*im2;
  }
#endif
#endif #endif

View File

@ -25,7 +25,7 @@
/*! \file sharp_core.c /*! \file sharp_core.c
* Computational core * Computational core
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -41,6 +41,7 @@
typedef complex double dcmplx; typedef complex double dcmplx;
// must be in the range [0;6]
#define MAXJOB_SPECIAL 2 #define MAXJOB_SPECIAL 2
#define XCONCAT2(a,b) a##_##b #define XCONCAT2(a,b) a##_##b
@ -49,188 +50,188 @@ typedef complex double dcmplx;
#define CONCAT3(a,b,c) XCONCAT3(a,b,c) #define CONCAT3(a,b,c) XCONCAT3(a,b,c)
#define nvec 1 #define nvec 1
#include "sharp_inchelper1.inc.c" #include "sharp_core_inchelper.c"
#undef nvec #undef nvec
#define nvec 2 #define nvec 2
#include "sharp_inchelper1.inc.c" #include "sharp_core_inchelper.c"
#undef nvec #undef nvec
#define nvec 3 #define nvec 3
#include "sharp_inchelper1.inc.c" #include "sharp_core_inchelper.c"
#undef nvec #undef nvec
#define nvec 4 #define nvec 4
#include "sharp_inchelper1.inc.c" #include "sharp_core_inchelper.c"
#undef nvec #undef nvec
#define nvec 5 #define nvec 5
#include "sharp_inchelper1.inc.c" #include "sharp_core_inchelper.c"
#undef nvec #undef nvec
#define nvec 6 #define nvec 6
#include "sharp_inchelper1.inc.c" #include "sharp_core_inchelper.c"
#undef nvec #undef nvec
void inner_loop (sharp_job *job, const int *ispair,const double *cth, void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx) const int *mlim)
{ {
int njobs=job->ntrans; int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
if (njobs<=MAXJOB_SPECIAL) if (njobs<=MAXJOB_SPECIAL)
{ {
switch (njobs*16+job->nv) switch (njobs*16+nv)
{ {
#if (MAXJOB_SPECIAL>=1) #if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
case 0x11: case 0x11:
CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x12: case 0x12:
CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x13: case 0x13:
CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x14: case 0x14:
CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x15: case 0x15:
CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x16: case 0x16:
CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
#endif #endif
#if (MAXJOB_SPECIAL>=2) #if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
case 0x21: case 0x21:
CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x22: case 0x22:
CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x23: case 0x23:
CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x24: case 0x24:
CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x25: case 0x25:
CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x26: case 0x26:
CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
#endif #endif
#if (MAXJOB_SPECIAL>=3) #if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
case 0x31: case 0x31:
CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x32: case 0x32:
CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x33: case 0x33:
CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x34: case 0x34:
CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x35: case 0x35:
CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x36: case 0x36:
CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
#endif #endif
#if (MAXJOB_SPECIAL>=4) #if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
case 0x41: case 0x41:
CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x42: case 0x42:
CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x43: case 0x43:
CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x44: case 0x44:
CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x45: case 0x45:
CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x46: case 0x46:
CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
#endif #endif
#if (MAXJOB_SPECIAL>=5) #if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
case 0x51: case 0x51:
CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x52: case 0x52:
CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x53: case 0x53:
CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x54: case 0x54:
CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x55: case 0x55:
CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x56: case 0x56:
CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
#endif #endif
#if (MAXJOB_SPECIAL>=6) #if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
case 0x61: case 0x61:
CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x62: case 0x62:
CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x63: case 0x63:
CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x64: case 0x64:
CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x65: case 0x65:
CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
case 0x66: case 0x66:
CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return; return;
#endif #endif
} }
} }
#if (MAXJOB_SPECIAL<6) #if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
else else
{ {
switch (job->nv) switch (nv)
{ {
case 1: case 1:
CONCAT2(inner_loop,1) CONCAT2(inner_loop,1)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return; return;
case 2: case 2:
CONCAT2(inner_loop,2) CONCAT2(inner_loop,2)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return; return;
case 3: case 3:
CONCAT2(inner_loop,3) CONCAT2(inner_loop,3)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return; return;
case 4: case 4:
CONCAT2(inner_loop,4) CONCAT2(inner_loop,4)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return; return;
case 5: case 5:
CONCAT2(inner_loop,5) CONCAT2(inner_loop,5)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return; return;
case 6: case 6:
CONCAT2(inner_loop,6) CONCAT2(inner_loop,6)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return; return;
} }
} }

View File

@ -25,7 +25,7 @@
/*! \file sharp_core.h /*! \file sharp_core.h
* Interface for the computational core * Interface for the computational core
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -41,7 +41,7 @@ extern "C" {
void inner_loop (sharp_job *job, const int *ispair,const double *cth, void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx); const int *mlim);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -70,31 +70,31 @@ static inline Tb Y(Tbprod)(Tb a, Tb b)
static inline void Y(Tbmuleq)(Tb * restrict a, Tb b) static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
{ for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); } { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
static inline void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
double maxval) double maxval)
{ {
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
for (int i=0;i<nvec; ++i) for (int i=0;i<nvec; ++i)
{ {
Tv mask = vgt(vabs(val->v[i]),vfmax); Tm mask = vgt(vabs(val->v[i]),vfmax);
while (vanyTrue(mask)) while (vanyTrue(mask))
{ {
vmuleq(val->v[i],vblend(mask,vfsmall,vone)); vmuleq_mask(mask,val->v[i],vfsmall);
vaddeq(scale->v[i],vblend(mask,vone,vzero)); vaddeq_mask(mask,scale->v[i],vone);
mask = vgt(vabs(val->v[i]),vfmax); mask = vgt(vabs(val->v[i]),vfmax);
} }
mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
while (vanyTrue(mask)) while (vanyTrue(mask))
{ {
vmuleq(val->v[i],vblend(mask,vfbig,vone)); vmuleq_mask(mask,val->v[i],vfbig);
vsubeq(scale->v[i],vblend(mask,vone,vzero)); vsubeq_mask(mask,scale->v[i],vone);
mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
} }
} }
} }
static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd, static void Y(mypow) (Tb val, int npow, Tb * restrict resd,
Tb * restrict ress) Tb * restrict ress)
{ {
Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
@ -131,13 +131,13 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
int did_scale=0; int did_scale=0;
for (int i=0;i<nvec; ++i) for (int i=0;i<nvec; ++i)
{ {
Tv mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol)); Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
if (vanyTrue(mask)) if (vanyTrue(mask))
{ {
did_scale=1; did_scale=1;
Tv fact = vblend(mask,vload(sharp_fsmall),vone); vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact); vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
vaddeq(scale->v[i],vblend(mask,vone,vzero)); vaddeq_mask(mask,scale->v[i],vone);
} }
} }
return did_scale; return did_scale;
@ -146,29 +146,29 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
static inline int Y(TballLt)(Tb a,double b) static inline int Y(TballLt)(Tb a,double b)
{ {
Tv vb=vload(b); Tv vb=vload(b);
Tv res=vlt(a.v[0],vb); Tm res=vlt(a.v[0],vb);
for (int i=1; i<nvec; ++i) for (int i=1; i<nvec; ++i)
res=vand(res,vlt(a.v[i],vb)); res=vand_mask(res,vlt(a.v[i],vb));
return vallTrue(res); return vallTrue(res);
} }
static inline int Y(TballGt)(Tb a,double b) static inline int Y(TballGt)(Tb a,double b)
{ {
Tv vb=vload(b); Tv vb=vload(b);
Tv res=vgt(a.v[0],vb); Tm res=vgt(a.v[0],vb);
for (int i=1; i<nvec; ++i) for (int i=1; i<nvec; ++i)
res=vand(res,vgt(a.v[i],vb)); res=vand_mask(res,vgt(a.v[i],vb));
return vallTrue(res); return vallTrue(res);
} }
static inline int Y(TballGe)(Tb a,double b) static inline int Y(TballGe)(Tb a,double b)
{ {
Tv vb=vload(b); Tv vb=vload(b);
Tv res=vge(a.v[0],vb); Tm res=vge(a.v[0],vb);
for (int i=1; i<nvec; ++i) for (int i=1; i<nvec; ++i)
res=vand(res,vge(a.v[i],vb)); res=vand_mask(res,vge(a.v[i],vb));
return vallTrue(res); return vallTrue(res);
} }
static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac, static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
const double * restrict cf) const double * restrict cf)
{ {
Y(Tbu) sc, corf; Y(Tbu) sc, corf;
@ -220,7 +220,7 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
} }
} }
static void Y(iter_to_ieee_spin) (const Tb cth, int *l_, static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_, Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen) Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
{ {
@ -232,6 +232,11 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
cth2.v[i]=vmax(cth2.v[i],vload(1e-15)); cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5))); sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
sth2.v[i]=vmax(sth2.v[i],vload(1e-15)); sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
Tm mask=vlt(sth.v[i],vzero);
Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
vmuleq_mask(cmask,cth2.v[i],vload(-1.));
Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
vmuleq_mask(smask,sth2.v[i],vload(-1.));
} }
Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps; Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;

View File

@ -25,25 +25,17 @@
/*! \file sharp_core_inc2.c /*! \file sharp_core_inc2.c
* Type-dependent code for the computational core * Type-dependent code for the computational core
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
typedef struct static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
{ Y(Tbri) j[njobs]; } Z(Tbrij); Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
typedef union
{ Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij);
typedef struct
{ Y(Tbqu) j[njobs]; } Z(Tbquj);
typedef union
{ Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj);
static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax) int l, int lmax NJ1)
{
if (njobs>1)
{ {
#if (njobs>1)
while (l<lmax-2) while (l<lmax-2)
{ {
Tb lam_3, lam_4; Tb lam_3, lam_4;
@ -64,8 +56,8 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
ai4=vload(cimag(alm[njobs*(l+2)+j])); ai4=vload(cimag(alm[njobs*(l+2)+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaaeq(p1->j[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4); vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4); vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
} }
Tv ar3=vload(creal(alm[njobs*(l+1)+j])), Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
ai3=vload(cimag(alm[njobs*(l+1)+j])), ai3=vload(cimag(alm[njobs*(l+1)+j])),
@ -73,8 +65,8 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
ai1=vload(cimag(alm[njobs*(l+3)+j])); ai1=vload(cimag(alm[njobs*(l+3)+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaaeq(p2->j[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1); vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1); vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
} }
} }
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]); r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
@ -82,7 +74,7 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1)); lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
l+=4; l+=4;
} }
#endif }
while (l<lmax) while (l<lmax)
{ {
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]); Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
@ -94,15 +86,15 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
ai=vload(cimag(alm[njobs*l+j])); ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar); vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai); vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
} }
ar=vload(creal(alm[njobs*(l+1)+j])); ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j])); ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(p2->j[j].r.v[i],lam_1.v[i],ar); vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai); vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
} }
} }
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
@ -117,16 +109,17 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j])); Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar); vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai); vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
} }
} }
} }
} }
static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1, static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax) const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
NJ1)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -138,13 +131,13 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero; Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(tr1,lam_2.v[i],p1->j[j].r.v[i]); vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]); vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
} }
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(tr2,lam_1.v[i],p2->j[j].r.v[i]); vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]); vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
} }
vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]); vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
} }
@ -160,8 +153,8 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
Tv tre=vzero, tim=vzero; Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(tre,lam_2.v[i],p1->j[j].r.v[i]); vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]); vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
} }
alm[l*njobs+j]+=vhsum_cmplx(tre,tim); alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
} }
@ -169,14 +162,14 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
} }
static void Z(calc_alm2map) (const Tb cth, const Tb sth, static void Z(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
Z(Tbrij) * restrict p2, int *done) Y(Tbri) * restrict p2 NJ1)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale; Tb lam_1,lam_2,scale;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec; job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; } if (l>lmax) return;
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
Tb corfac; Tb corfac;
@ -192,8 +185,8 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv tmp=vmul(lam_2.v[i],corfac.v[i]); Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(p1->j[j].r.v[i],tmp,ar); vfmaeq(p1[j].r.v[i],tmp,ar);
vfmaeq(p1->j[j].i.v[i],tmp,ai); vfmaeq(p1[j].i.v[i],tmp,ai);
} }
} }
if (++l>lmax) break; if (++l>lmax) break;
@ -206,8 +199,8 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv tmp=vmul(lam_1.v[i],corfac.v[i]); Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(p2->j[j].r.v[i],tmp,ar); vfmaeq(p2[j].r.v[i],tmp,ar);
vfmaeq(p2->j[j].i.v[i],tmp,ai); vfmaeq(p2[j].i.v[i],tmp,ai);
} }
} }
if (++l>lmax) break; if (++l>lmax) break;
@ -220,22 +213,22 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
full_ieee = Y(TballGe)(scale,sharp_minscale); full_ieee = Y(TballGe)(scale,sharp_minscale);
} }
} }
if (l>lmax) { *done=1; return; } if (l>lmax) return;
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
} }
static void Z(calc_map2alm) (const Tb cth, const Tb sth, static void Z(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Z(Tbrij) * restrict p2, int *done) const Y(Tbri) * restrict p2 NJ1)
{ {
int lmax=gen->lmax; int lmax=gen->lmax;
Tb lam_1,lam_2,scale; Tb lam_1,lam_2,scale;
int l=gen->m; int l=gen->m;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec; job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; } if (l>lmax) return;
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
@ -251,12 +244,12 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv tmp=vmul(lam_2.v[i],corfac.v[i]); Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p1->j[j].r.v[i]); vfmaeq(tre,tmp,p1[j].r.v[i]);
vfmaeq(tim,tmp,p1->j[j].i.v[i]); vfmaeq(tim,tmp,p1[j].i.v[i]);
} }
alm[l*njobs+j]+=vhsum_cmplx(tre,tim); alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
} }
if (++l>lmax) { *done=1; return; } if (++l>lmax) return;
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
@ -266,12 +259,12 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv tmp=vmul(lam_1.v[i],corfac.v[i]); Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p2->j[j].r.v[i]); vfmaeq(tre,tmp,p2[j].r.v[i]);
vfmaeq(tim,tmp,p2->j[j].i.v[i]); vfmaeq(tim,tmp,p2[j].i.v[i]);
} }
alm[l*njobs+j]+=vhsum_cmplx(tre,tim); alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
} }
if (++l>lmax) { *done=1; return; } if (++l>lmax) return;
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
@ -283,11 +276,11 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
} }
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
} }
static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py, static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm) const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
@ -296,25 +289,25 @@ static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lw=vadd(rxp.v[i],rxm.v[i]); Tv lw=vadd(rxp.v[i],rxm.v[i]);
vfmaeq(px->j[j].qr.v[i],agr,lw); vfmaeq(px[j].qr.v[i],agr,lw);
vfmaeq(px->j[j].qi.v[i],agi,lw); vfmaeq(px[j].qi.v[i],agi,lw);
vfmaeq(px->j[j].ur.v[i],acr,lw); vfmaeq(px[j].ur.v[i],acr,lw);
vfmaeq(px->j[j].ui.v[i],aci,lw); vfmaeq(px[j].ui.v[i],aci,lw);
} }
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lx=vsub(rxm.v[i],rxp.v[i]); Tv lx=vsub(rxm.v[i],rxp.v[i]);
vfmseq(py->j[j].qr.v[i],aci,lx); vfmseq(py[j].qr.v[i],aci,lx);
vfmaeq(py->j[j].qi.v[i],acr,lx); vfmaeq(py[j].qi.v[i],acr,lx);
vfmaeq(py->j[j].ur.v[i],agi,lx); vfmaeq(py[j].ur.v[i],agi,lx);
vfmseq(py->j[j].ui.v[i],agr,lx); vfmseq(py[j].ui.v[i],agr,lx);
} }
} }
} }
static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
const dcmplx * restrict alm1, const dcmplx * restrict alm2) const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
@ -326,26 +319,26 @@ static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2,
{ {
Tv lw1=vadd(r2p.v[i],r2m.v[i]); Tv lw1=vadd(r2p.v[i],r2m.v[i]);
Tv lx2=vsub(r1m.v[i],r1p.v[i]); Tv lx2=vsub(r1m.v[i],r1p.v[i]);
vfmaseq(p1->j[j].qr.v[i],agr1,lw1,aci2,lx2); vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
vfmaaeq(p1->j[j].qi.v[i],agi1,lw1,acr2,lx2); vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
vfmaaeq(p1->j[j].ur.v[i],acr1,lw1,agi2,lx2); vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
vfmaseq(p1->j[j].ui.v[i],aci1,lw1,agr2,lx2); vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
} }
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lx1=vsub(r2m.v[i],r2p.v[i]); Tv lx1=vsub(r2m.v[i],r2p.v[i]);
Tv lw2=vadd(r1p.v[i],r1m.v[i]); Tv lw2=vadd(r1p.v[i],r1m.v[i]);
vfmaseq(p2->j[j].qr.v[i],agr2,lw2,aci1,lx1); vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
vfmaaeq(p2->j[j].qi.v[i],agi2,lw2,acr1,lx1); vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
vfmaaeq(p2->j[j].ur.v[i],acr2,lw2,agi1,lx1); vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
vfmaseq(p2->j[j].ui.v[i],aci2,lw2,agr1,lx1); vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
} }
} }
} }
static inline void Z(saddstep2) (const Z(Tbquj) * restrict px, static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
const Z(Tbquj) * restrict py, const Tb * restrict rxp, const Y(Tbqu) * restrict py, const Tb * restrict rxp,
const Tb * restrict rxm, dcmplx * restrict alm) const Tb * restrict rxm, dcmplx * restrict alm NJ1)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
@ -353,27 +346,27 @@ static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lw=vadd(rxp->v[i],rxm->v[i]); Tv lw=vadd(rxp->v[i],rxm->v[i]);
vfmaeq(agr,px->j[j].qr.v[i],lw); vfmaeq(agr,px[j].qr.v[i],lw);
vfmaeq(agi,px->j[j].qi.v[i],lw); vfmaeq(agi,px[j].qi.v[i],lw);
vfmaeq(acr,px->j[j].ur.v[i],lw); vfmaeq(acr,px[j].ur.v[i],lw);
vfmaeq(aci,px->j[j].ui.v[i],lw); vfmaeq(aci,px[j].ui.v[i],lw);
} }
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lx=vsub(rxm->v[i],rxp->v[i]); Tv lx=vsub(rxm->v[i],rxp->v[i]);
vfmseq(agr,py->j[j].ui.v[i],lx); vfmseq(agr,py[j].ui.v[i],lx);
vfmaeq(agi,py->j[j].ur.v[i],lx); vfmaeq(agi,py[j].ur.v[i],lx);
vfmaeq(acr,py->j[j].qi.v[i],lx); vfmaeq(acr,py[j].qi.v[i],lx);
vfmseq(aci,py->j[j].qr.v[i],lx); vfmseq(aci,py[j].qr.v[i],lx);
} }
vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
} }
} }
static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1, static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax) int lmax NJ1)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -386,13 +379,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])), rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i])); vmul(fx2,rec1m.v[i]));
} }
#if (njobs>1)
Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
&alm[2*njobs*(l+1)]); &alm[2*njobs*(l+1)] NJ2);
#else
Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]);
#endif
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]); fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
@ -405,12 +393,13 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
l+=2; l+=2;
} }
if (l==lmax) if (l==lmax)
Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]); Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
} }
static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1, static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax) const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
NJ1)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -423,8 +412,8 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])), rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i])); vmul(fx2,rec1m.v[i]));
} }
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]); Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)]); Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]); fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
@ -437,18 +426,19 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
l+=2; l+=2;
} }
if (l==lmax) if (l==lmax)
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]); Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
} }
static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done) const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2 NJ1)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); Y(iter_to_ieee_spin)
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec; job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) if (l>lmax) return;
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
@ -460,12 +450,12 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
&& Y(TballGe)(scalem,sharp_minscale); && Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Z(saddstep)(p1, p2, Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), &alm[2*njobs*l]); &alm[2*njobs*l] NJ2);
if (++l>lmax) break; if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Z(saddstep)(p2, p1, Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), &alm[2*njobs*l]); &alm[2*njobs*l] NJ2);
if (++l>lmax) break; if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -477,24 +467,24 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
} }
} }
if (l>lmax) if (l>lmax) return;
{ *done=1; return; }
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Z(alm2map_spin_kernel) (cth,p1,p2, Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax); lmax NJ2);
} }
static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2, const sharp_Ylmgen_C * restrict gen, sharp_job *job,
int *done) const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); Y(iter_to_ieee_spin)
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec; job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) { *done=1; return; } if (l>lmax) return;
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
@ -507,12 +497,12 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
while (!full_ieee) while (!full_ieee)
{ {
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l]); Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
if (++l>lmax) { *done=1; return; } if (++l>lmax) return;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l]); Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
if (++l>lmax) { *done=1; return; } if (++l>lmax) return;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{ {
@ -525,12 +515,11 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
} }
static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm, const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
const dcmplx * restrict alm)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
@ -538,22 +527,22 @@ static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lw=vadd(rxp.v[i],rxm.v[i]); Tv lw=vadd(rxp.v[i],rxm.v[i]);
vfmaeq(px->j[j].qr.v[i],ar,lw); vfmaeq(px[j].qr.v[i],ar,lw);
vfmaeq(px->j[j].qi.v[i],ai,lw); vfmaeq(px[j].qi.v[i],ai,lw);
} }
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lx=vsub(rxm.v[i],rxp.v[i]); Tv lx=vsub(rxm.v[i],rxp.v[i]);
vfmaeq(py->j[j].ur.v[i],ai,lx); vfmaeq(py[j].ur.v[i],ai,lx);
vfmseq(py->j[j].ui.v[i],ar,lx); vfmseq(py[j].ui.v[i],ar,lx);
} }
} }
} }
static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1, static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax) int lmax NJ1)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -566,8 +555,8 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])), rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i])); vmul(fx2,rec1m.v[i]));
} }
Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l]); Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)]); Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]); fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
@ -580,18 +569,19 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
l+=2; l+=2;
} }
if (l==lmax) if (l==lmax)
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]); Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
} }
static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done) const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2 NJ1)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); Y(iter_to_ieee_spin)
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec; job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) if (l>lmax) return;
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
@ -604,11 +594,11 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
while (!full_ieee) while (!full_ieee)
{ {
Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
&alm[njobs*l]); &alm[njobs*l] NJ2);
if (++l>lmax) break; if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
&alm[njobs*l]); &alm[njobs*l] NJ2);
if (++l>lmax) break; if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -620,20 +610,20 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
} }
} }
if (l>lmax) if (l>lmax) return;
{ *done=1; return; }
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax); lmax NJ2);
} }
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Z(inner_loop) (sharp_job *job, const int *ispair, static void Z(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *idx) sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
{ {
const int nval=nvec*VLEN; const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi]; const int m = job->ainfo->mval[mi];
@ -646,35 +636,32 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
{ {
if (job->spin==0) if (job->spin==0)
{ {
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval) for (int ith=0; ith<ulim-llim; ith+=nval)
{ {
Z(Tburij) p1,p2; VZERO(p1); VZERO(p2); Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done) Y(Tbu) cth, sth;
{
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i) int skip=1;
{ for (int i=0; i<nval; ++i)
int itot=i+ith; {
if (itot>=ulim-llim) itot=ulim-llim-1; int itot=i+ith;
itot=idx[itot]; if (itot>=ulim-llim) itot=ulim-llim-1;
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; if (mlim[itot]>=m) skip=0;
} cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
} }
if (!skip)
Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
{ {
int itot=i+ith; int itot=i+ith;
if (itot<ulim-llim) if (itot<ulim-llim)
{ {
itot=idx[itot];
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi)); int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I, complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I; r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
job->phase[phas_idx] = r1+r2; job->phase[phas_idx] = r1+r2;
if (ispair[itot]) if (ispair[itot])
job->phase[phas_idx+1] = r1-r2; job->phase[phas_idx+1] = r1-r2;
@ -685,39 +672,38 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
} }
else else
{ {
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval) for (int ith=0; ith<ulim-llim; ith+=nval)
{ {
Z(Tbuquj) p1,p2; VZERO(p1); VZERO(p2); Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done) Y(Tbu) cth, sth;
{ int skip=1;
Y(Tbu) cth;
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
{ {
int itot=i+ith; int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1; if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot]; if (mlim[itot]>=m) skip=0;
cth.s[i]=cth_[itot]; cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
(job->type==SHARP_ALM2MAP) ?
Z(calc_alm2map_spin ) (cth.b,gen,job,&p1.b,&p2.b,&done) :
Z(calc_alm2map_deriv1) (cth.b,gen,job,&p1.b,&p2.b,&done);
} }
if (!skip)
(job->type==SHARP_ALM2MAP) ?
Z(calc_alm2map_spin )
(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
Z(calc_alm2map_deriv1)
(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
{ {
int itot=i+ith; int itot=i+ith;
if (itot<ulim-llim) if (itot<ulim-llim)
{ {
itot=idx[itot];
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi)); int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I, complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I, q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I, u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I; u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
job->phase[phas_idx] = q1+q2; job->phase[phas_idx] = q1+q2;
job->phase[phas_idx+2] = u1+u2; job->phase[phas_idx+2] = u1+u2;
if (ispair[itot]) if (ispair[itot])
@ -740,70 +726,77 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
{ {
if (job->spin==0) if (job->spin==0)
{ {
int done=0; for (int ith=0; ith<ulim-llim; ith+=nval)
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{ {
Z(Tburij) p1, p2; VZERO(p1); VZERO(p2); Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth; Y(Tbu) cth, sth;
int skip=1;
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
{ {
int itot=i+ith; int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1; if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot]; if (mlim[itot]>=m) skip=0;
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
}
}
}
if (!skip)
Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
}
}
else
{
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
int skip=1;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
if (mlim[itot]>=m) skip=0;
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if (i+ith<ulim-llim) if (i+ith<ulim-llim)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi)); int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2);
p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2);
}
}
}
Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
}
}
else
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Z(Tbuquj) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx p1Q=job->phase[phas_idx], dcmplx p1Q=job->phase[phas_idx],
p1U=job->phase[phas_idx+2], p1U=job->phase[phas_idx+2],
p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1) if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; } { p2Q=-p2Q; p2U=-p2U; }
p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q); p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U); p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q); p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U); p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
} }
} }
} }
Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done); if (!skip)
Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
} }
} }
break; break;
} }
default:
{
UTIL_FAIL("must not happen");
break;
}
} }
} }

View File

@ -1,800 +0,0 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc3.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
/*! Forward (alm -> map) inner kernel for the scalar case.
    Processes all degrees from \a l through \a lmax inclusive. Each step
    advances the three-term recurrence
      lam_next = cth*lam_cur*rf[k].f[0] - lam_prev*rf[k].f[1]
    and adds (recurrence value)*(alm entry) to the accumulators. Counted from
    the entry value of \a l, the terms for l, l+2, ... go to \a p1 and the
    terms for l+1, l+3, ... go to \a p2.
    \param cth    per-lane factor of the recurrence, read as cth.v[i]
                  (presumably cos(theta) -- confirm against caller)
    \param p1,p2  arrays of \a njobs accumulators (fields r and i), updated
                  in place
    \param lam_1,lam_2 recurrence state, passed by value; on entry lam_2 is
                  the value multiplied with the alm entry of degree \a l
    \param rf     recurrence coefficients, indexed by degree
    \param alm    input coefficients, addressed as alm[njobs*degree+j]
    \param l      first degree to process
    \param lmax   last degree to process
    \param njobs  number of jobs handled simultaneously
    NOTE(review): vload, vmul, vsub, vfmaeq and vfmaaeq are defined outside
    this chunk; presumably vfmaeq(a,b,c) is a+=b*c and vfmaaeq(a,b,c,d,e) is
    a+=b*c+d*e -- confirm against the vector support header. */
static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax, int njobs)
{
/* Main loop: four degrees (l .. l+3) per iteration. The condition l<lmax-2
   guarantees l+3<=lmax, so alm[njobs*(l+3)+j] is a valid degree. */
while (l<lmax-2)
{
Tb lam_3, lam_4;
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
/* lam_3: value paired with degree l+1 */
for (int i=0; i<nvec; ++i)
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
/* lam_4: value paired with degree l+2 */
for (int i=0; i<nvec; ++i)
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
/* lam_1 is overwritten with the value paired with degree l+3 */
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
for (int j=0; j<njobs; ++j)
{
/* degrees l and l+2 feed p1 */
Tv ar2=vload(creal(alm[njobs*l+j])),
ai2=vload(cimag(alm[njobs*l+j])),
ar4=vload(creal(alm[njobs*(l+2)+j])),
ai4=vload(cimag(alm[njobs*(l+2)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
}
/* degrees l+1 and l+3 feed p2 */
Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
ai3=vload(cimag(alm[njobs*(l+1)+j])),
ar1=vload(creal(alm[njobs*(l+3)+j])),
ai1=vload(cimag(alm[njobs*(l+3)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
}
}
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
/* lam_2 becomes the value for degree l+4, i.e. the entry state of the
   next iteration */
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
l+=4;
}
/* Remainder loop: two degrees (l, l+1) per iteration. */
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),
ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
}
ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
}
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
/* One degree left over: only the p1 contribution of degree lmax remains. */
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
}
}
}
}
/*! Backward (map -> alm) inner kernel for the scalar case.
    Processes all degrees from \a l through \a lmax inclusive, two per
    iteration. For every degree the recurrence value is multiplied with the
    accumulator contents and summed over the nvec vectors; the result is
    handed to vhsum_cmplx2 / vhsum_cmplx to update \a alm. Counted from the
    entry value of \a l, degrees l, l+2, ... read \a p1 and degrees
    l+1, l+3, ... read \a p2.
    \param cth    per-lane factor of the recurrence, read as cth.v[i]
    \param p1,p2  arrays of \a njobs input accumulators (fields r and i)
    \param lam_1,lam_2 recurrence state, passed by value; on entry lam_2 is
                  the value paired with degree \a l
    \param rf     recurrence coefficients, indexed by degree
    \param alm    output coefficients, addressed as alm[degree*njobs+j]
    \param l      first degree to process
    \param lmax   last degree to process
    \param njobs  number of jobs handled simultaneously
    NOTE(review): vhsum_cmplx2 is defined outside this chunk; presumably it
    reduces each vector over its lanes and adds the two complex results to
    the two alm entries passed by pointer -- confirm. */
static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
int njobs)
{
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
/* advance: lam_1 becomes the value paired with degree l+1 */
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
/* degree l: lam_2 against p1 */
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
}
/* degree l+1: lam_1 against p2 */
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
}
vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
/* advance: lam_2 becomes the value paired with degree l+2 */
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
/* One degree left over: only the p1 term of degree lmax remains. */
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
}
}
/*! Driver for the scalar alm -> map accumulation of one block of rings.
    Obtains the starting degree and recurrence state from Y(iter_to_ieee),
    runs a scaled recurrence (values multiplied by a correction factor from
    Y(getCorfac)) until Y(TballGe)(scale,sharp_minscale) holds, then applies
    the correction factor once and hands over to Y(alm2map_kernel).
    \param cth,sth per-lane inputs forwarded to Y(iter_to_ieee); cth is also
                  the recurrence factor (presumably cos/sin of the
                  colatitude -- confirm against caller)
    \param gen    generator state; lmax, m, cf and rf are read here
    \param job    job->almtmp supplies the coefficients, job->opcnt is
                  incremented (presumably an operation-count statistic)
    \param p1,p2  arrays of \a njobs accumulators, updated in place
    \param njobs  number of jobs handled simultaneously
    \param done   set to 1 whenever the degree exceeds lmax before the
                  unscaled kernel is reached; otherwise left untouched */
static void Y(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, int njobs, int *done)
{
int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale;
/* l is an output of this call: it is read below without having been
   assigned here */
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
/* Scaled phase: every contribution is multiplied by corfac, and the state
   is passed through Y(rescale) after each pair of degrees. */
while (!full_ieee)
{
/* degree l: lam_2 feeds p1 */
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(p1[j].r.v[i],tmp,ar);
vfmaeq(p1[j].i.v[i],tmp,ai);
}
}
if (++l>lmax) break;
/* l was already incremented, hence the rf[l-1] index */
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
/* next degree: lam_1 feeds p2 */
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(p2[j].r.v[i],tmp,ar);
vfmaeq(p2[j].i.v[i],tmp,ai);
}
}
if (++l>lmax) break;
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
/* corfac and the exit condition are refreshed only when Y(rescale)
   reports a change */
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
}
}
/* reached via one of the breaks above: all degrees already handled */
if (l>lmax) { *done=1; return; }
/* fold the correction factor into the state, then continue unscaled */
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
}
/*! Driver for the scalar map -> alm accumulation of one block of rings.
    Mirrors Y(calc_alm2map): gets the starting degree and recurrence state
    from Y(iter_to_ieee), runs a scaled phase in which each recurrence value
    is multiplied by the correction factor before being contracted with the
    accumulators, then hands over to Y(map2alm_kernel).
    \param cth,sth per-lane inputs forwarded to Y(iter_to_ieee); cth is also
                  the recurrence factor
    \param gen    generator state; lmax, m, cf and rf are read here
    \param job    job->almtmp receives the results (added to its entries),
                  job->opcnt is incremented
    \param p1,p2  arrays of \a njobs input accumulators
    \param njobs  number of jobs handled simultaneously
    \param done   set to 1 whenever the degree exceeds lmax before the
                  unscaled kernel is reached; otherwise left untouched */
static void Y(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, int njobs, int *done)
{
int lmax=gen->lmax;
Tb lam_1,lam_2,scale;
/* l is initialised here, but its address is also passed to
   Y(iter_to_ieee), which may overwrite it */
int l=gen->m;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
/* Scaled phase: values are multiplied by corfac before use. */
while (!full_ieee)
{
/* degree l: lam_2 against p1 */
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p1[j].r.v[i]);
vfmaeq(tim,tmp,p1[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) { *done=1; return; }
/* l was already incremented, hence the rf[l-1] index */
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
/* next degree: lam_1 against p2 */
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p2[j].r.v[i]);
vfmaeq(tim,tmp,p2[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) { *done=1; return; }
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
/* corfac and the exit condition are refreshed only when Y(rescale)
   reports a change */
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
}
}
/* fold the correction factor into the state, then continue unscaled */
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
}
/*! Adds the contribution of a single degree to the spin accumulators.
    For every job two consecutive coefficients alm[2*j] and alm[2*j+1] are
    read. The lane-wise sum rxp+rxm weights the update of \a px, the
    lane-wise difference rxm-rxp weights the update of \a py.
    \param px,py  arrays of \a njobs accumulators (fields qr, qi, ur, ui),
                  updated in place
    \param rxp,rxm the two recurrence values of this degree
    \param alm    pointer to the 2*njobs coefficients of this degree
    \param njobs  number of jobs handled simultaneously
    NOTE(review): vfmaeq/vfmseq are defined outside this chunk; presumably
    a+=b*c and a-=b*c respectively -- confirm. */
static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
  {
  for (int jb=0; jb<njobs; ++jb)
    {
    const dcmplx c0=alm[2*jb];
    const dcmplx c1=alm[2*jb+1];
    Tv c0re=vload(creal(c0));
    Tv c0im=vload(cimag(c0));
    Tv c1re=vload(creal(c1));
    Tv c1im=vload(cimag(c1));
    Y(Tbqu) *ax=px+jb;
    Y(Tbqu) *ay=py+jb;

    /* px part, weighted by the sum of the two recurrence values */
    for (int k=0; k<nvec; ++k)
      {
      Tv wsum=vadd(rxp.v[k],rxm.v[k]);
      vfmaeq(ax->qr.v[k],c0re,wsum);
      vfmaeq(ax->qi.v[k],c0im,wsum);
      vfmaeq(ax->ur.v[k],c1re,wsum);
      vfmaeq(ax->ui.v[k],c1im,wsum);
      }

    /* py part, weighted by the difference; note the crossed coefficients
       and the two subtracting updates */
    for (int k=0; k<nvec; ++k)
      {
      Tv wdif=vsub(rxm.v[k],rxp.v[k]);
      vfmseq(ay->qr.v[k],c1im,wdif);
      vfmaeq(ay->qi.v[k],c1re,wdif);
      vfmaeq(ay->ur.v[k],c0im,wdif);
      vfmseq(ay->ui.v[k],c0re,wdif);
      }
    }
  }
/*! Adds the contributions of two consecutive degrees to the spin
    accumulators in one pass.
    \a alm1 is combined with the recurrence pair (r2p,r2m) and \a alm2 with
    the pair (r1p,r1m). Each accumulator field receives one term weighted by
    a sum of a pair and one term weighted by a difference of the other pair.
    \param p1,p2  arrays of \a njobs accumulators (fields qr, qi, ur, ui),
                  updated in place
    \param r1p,r1m recurrence values belonging to \a alm2
    \param r2p,r2m recurrence values belonging to \a alm1
    \param alm1,alm2 pointers to the 2*njobs coefficients of the two degrees
    \param njobs  number of jobs handled simultaneously
    NOTE(review): vfmaaeq/vfmaseq are defined outside this chunk; presumably
    a+=b*c+d*e and a+=b*c-d*e respectively -- confirm. */
static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
  const dcmplx * restrict alm1, const dcmplx * restrict alm2, int njobs)
  {
  for (int jb=0; jb<njobs; ++jb)
    {
    /* x* = entry 2*jb, y* = entry 2*jb+1; suffix 1/2 = alm1/alm2 */
    const dcmplx cx1=alm1[2*jb], cy1=alm1[2*jb+1];
    const dcmplx cx2=alm2[2*jb], cy2=alm2[2*jb+1];
    Tv x1re=vload(creal(cx1));
    Tv x1im=vload(cimag(cx1));
    Tv y1re=vload(creal(cy1));
    Tv y1im=vload(cimag(cy1));
    Tv x2re=vload(creal(cx2));
    Tv x2im=vload(cimag(cx2));
    Tv y2re=vload(creal(cy2));
    Tv y2im=vload(cimag(cy2));
    Y(Tbqu) *acc1=p1+jb;
    Y(Tbqu) *acc2=p2+jb;

    for (int k=0; k<nvec; ++k)
      {
      Tv sum2=vadd(r2p.v[k],r2m.v[k]);
      Tv dif1=vsub(r1m.v[k],r1p.v[k]);
      vfmaseq(acc1->qr.v[k],x1re,sum2,y2im,dif1);
      vfmaaeq(acc1->qi.v[k],x1im,sum2,y2re,dif1);
      vfmaaeq(acc1->ur.v[k],y1re,sum2,x2im,dif1);
      vfmaseq(acc1->ui.v[k],y1im,sum2,x2re,dif1);
      }

    for (int k=0; k<nvec; ++k)
      {
      Tv dif2=vsub(r2m.v[k],r2p.v[k]);
      Tv sum1=vadd(r1p.v[k],r1m.v[k]);
      vfmaseq(acc2->qr.v[k],x2re,sum1,y1im,dif2);
      vfmaaeq(acc2->qi.v[k],x2im,sum1,y1re,dif2);
      vfmaaeq(acc2->ur.v[k],y2re,sum1,x1im,dif2);
      vfmaseq(acc2->ui.v[k],y2im,sum1,x1re,dif2);
      }
    }
  }
/*! Contracts the spin accumulators with the recurrence values of a single
    degree and passes the result to vhsum_cmplx2 for alm[2*j], alm[2*j+1].
    The lane-wise sum rxp+rxm weights the fields of \a px, the lane-wise
    difference rxm-rxp weights the (crossed) fields of \a py.
    \param px,py  arrays of \a njobs input accumulators (fields qr,qi,ur,ui)
    \param rxp,rxm pointers to the two recurrence values of this degree
    \param alm    pointer to the 2*njobs output coefficients of this degree
    \param njobs  number of jobs handled simultaneously
    NOTE(review): vhsum_cmplx2 is defined outside this chunk; presumably it
    reduces over the vector lanes and adds to the two targets -- confirm. */
static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
  const Tb * restrict rxm, dcmplx * restrict alm, int njobs)
  {
  for (int jb=0; jb<njobs; ++jb)
    {
    const Y(Tbqu) *ax=px+jb;
    const Y(Tbqu) *ay=py+jb;
    Tv s0re=vzero;
    Tv s0im=vzero;
    Tv s1re=vzero;
    Tv s1im=vzero;

    /* px part, weighted by the sum of the two recurrence values */
    for (int k=0; k<nvec; ++k)
      {
      Tv wsum=vadd(rxp->v[k],rxm->v[k]);
      vfmaeq(s0re,ax->qr.v[k],wsum);
      vfmaeq(s0im,ax->qi.v[k],wsum);
      vfmaeq(s1re,ax->ur.v[k],wsum);
      vfmaeq(s1im,ax->ui.v[k],wsum);
      }

    /* py part, weighted by the difference, with crossed fields */
    for (int k=0; k<nvec; ++k)
      {
      Tv wdif=vsub(rxm->v[k],rxp->v[k]);
      vfmseq(s0re,ay->ui.v[k],wdif);
      vfmaeq(s0im,ay->ur.v[k],wdif);
      vfmaeq(s1re,ay->qi.v[k],wdif);
      vfmseq(s1im,ay->qr.v[k],wdif);
      }

    vhsum_cmplx2(s0re,s0im,s1re,s1im,&alm[2*jb],&alm[2*jb+1]);
    }
  }
/*! Forward (alm -> map) inner kernel for the spin case.
    Processes all degrees from \a l through \a lmax inclusive, two per
    iteration, by alternately advancing the (rec1p,rec1m) and (rec2p,rec2m)
    pairs with the recurrence
      recXp = (cth-fx1)*fx0*recYp - fx2*recXp
      recXm = (cth+fx1)*fx0*recYm - fx2*recXm
    and passing both pairs to Y(saddstepb).
    \param cth    per-lane factor of the recurrence, read as cth.v[i]
    \param p1,p2  arrays of \a njobs accumulators, updated in place
    \param rec1p,rec1m,rec2p,rec2m recurrence state, passed by value; on
                  entry the rec2 pair is the one used for degree \a l
    \param fx     recurrence coefficients (three per degree)
    \param alm    input coefficients, 2*njobs entries per degree
    \param l      first degree to process
    \param lmax   last degree to process
    \param njobs  number of jobs handled simultaneously */
static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
/* advance the rec1 pair to degree l+1 (rec2 still holds degree l) */
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
/* degrees l and l+1 are accumulated together */
Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
&alm[2*njobs*(l+1)], njobs);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
/* advance the rec2 pair to degree l+2 for the next iteration */
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
/* One degree left over: handled with the single-degree step. */
if (l==lmax)
Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l], njobs);
}
/*! Backward (map -> alm) inner kernel for the spin case.
    Processes all degrees from \a l through \a lmax inclusive, two per
    iteration, using the same recurrence as Y(alm2map_spin_kernel) and
    calling Y(saddstep2) once per degree. The roles of \a p1 and \a p2 are
    swapped between consecutive degrees.
    \param cth    per-lane factor of the recurrence, read as cth.v[i]
    \param p1,p2  arrays of \a njobs input accumulators
    \param rec1p,rec1m,rec2p,rec2m recurrence state, passed by value; on
                  entry the rec2 pair is the one used for degree \a l
    \param fx     recurrence coefficients (three per degree)
    \param alm    output coefficients, 2*njobs entries per degree
    \param l      first degree to process
    \param lmax   last degree to process
    \param njobs  number of jobs handled simultaneously */
static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
int njobs)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
/* advance the rec1 pair to degree l+1 (rec2 still holds degree l) */
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
/* degree l uses (p1,p2) with rec2; degree l+1 uses (p2,p1) with rec1 */
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l],njobs);
Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)],njobs);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
/* advance the rec2 pair to degree l+2 for the next iteration */
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
/* One degree left over. */
if (l==lmax)
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
}
/*! Driver for the spin alm -> map accumulation of one block of rings.
    Obtains the starting degree, the two recurrence pairs and their scales
    from Y(iter_to_ieee_spin), runs a scaled phase (values multiplied by the
    correction factors from Y(getCorfac)) until both scales satisfy
    Y(TballGe)(.,sharp_minscale), then hands over to Y(alm2map_spin_kernel).
    \param cth    per-lane recurrence factor
    \param gen    generator state; lmax, m, cf and fx are read here
    \param job    job->almtmp supplies the coefficients, job->opcnt is
                  incremented
    \param p1,p2  arrays of \a njobs accumulators, updated in place
    \param njobs  number of jobs handled simultaneously
    \param done   set to 1 whenever the degree exceeds lmax before the
                  unscaled kernel is reached; otherwise left untouched */
static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
/* l is an output of this call: it is read below without having been
   assigned here */
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax)
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
/* Scaled phase: one degree per Y(saddstep) call, with p1/p2 swapped
   between consecutive degrees. */
while (!full_ieee)
{
Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
&alm[2*njobs*l],njobs);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
&alm[2*njobs*l], njobs);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
/* bitwise | rather than ||: both Y(rescale) calls are always executed */
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
/* reached via one of the breaks above: all degrees already handled */
if (l>lmax)
{ *done=1; return; }
/* fold the correction factors into the state, then continue unscaled */
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax, njobs);
}
/*! Driver for the spin map -> alm accumulation of one block of rings.
    Mirrors Y(calc_alm2map_spin): gets the starting degree, recurrence pairs
    and scales from Y(iter_to_ieee_spin), runs a scaled phase calling
    Y(saddstep2) once per degree, then hands over to Y(map2alm_spin_kernel).
    \param cth    per-lane recurrence factor
    \param gen    generator state; lmax, m, cf and fx are read here
    \param job    job->almtmp receives the results, job->opcnt is incremented
    \param p1,p2  arrays of \a njobs input accumulators
    \param njobs  number of jobs handled simultaneously
    \param done   set to 1 whenever the degree exceeds lmax before the
                  unscaled kernel is reached; otherwise left untouched */
static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
int njobs, int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
/* l is an output of this call: it is read below without having been
   assigned here */
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
/* Y(saddstep2) takes the recurrence values by pointer, so the scaled
   products are stored in temporaries first */
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l], njobs);
if (++l>lmax) { *done=1; return; }
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l], njobs);
if (++l>lmax) { *done=1; return; }
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
/* bitwise | rather than ||: both Y(rescale) calls are always executed */
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
/* fold the correction factors into the state, then continue unscaled */
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
}
/*! Adds the contribution of a single degree to the accumulators for the
    first-derivative transform.
    One coefficient alm[j] is read per job. The lane-wise sum rxp+rxm
    updates the qr/qi fields of \a px, the lane-wise difference rxm-rxp
    updates the ur/ui fields of \a py.
    \param px,py  arrays of \a njobs accumulators, updated in place
    \param rxp,rxm the two recurrence values of this degree
    \param alm    pointer to the njobs coefficients of this degree
    \param njobs  number of jobs handled simultaneously
    NOTE(review): vfmaeq/vfmseq are defined outside this chunk; presumably
    a+=b*c and a-=b*c respectively -- confirm. */
static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
  {
  for (int jb=0; jb<njobs; ++jb)
    {
    const dcmplx coef=alm[jb];
    Tv cre=vload(creal(coef));
    Tv cim=vload(cimag(coef));
    Y(Tbqu) *ax=px+jb;
    Y(Tbqu) *ay=py+jb;

    /* px part: weighted by the sum of the two recurrence values */
    for (int k=0; k<nvec; ++k)
      {
      Tv wsum=vadd(rxp.v[k],rxm.v[k]);
      vfmaeq(ax->qr.v[k],cre,wsum);
      vfmaeq(ax->qi.v[k],cim,wsum);
      }

    /* py part: weighted by the difference, real/imaginary parts crossed */
    for (int k=0; k<nvec; ++k)
      {
      Tv wdif=vsub(rxm.v[k],rxp.v[k]);
      vfmaeq(ay->ur.v[k],cim,wdif);
      vfmseq(ay->ui.v[k],cre,wdif);
      }
    }
  }
/* Main loop of the first-derivative alm->map synthesis.
   Starting at the given l, advances the "p" and "m" recursions two l values
   per iteration, using the coefficient triples fx[l+1] and fx[l+2], and
   accumulates the contributions of alm[njobs*l ..] and alm[njobs*(l+1) ..]
   into p1/p2 through saddstep_d (with p1 and p2 swapped for the second l).
   If the loop ends exactly at l==lmax, one last contribution is added.
   The recursion values are received by value; the caller's copies are
   not modified. */
static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax, int njobs)
  {
  for (; l<lmax; l+=2)
    {
    /* coefficients for the step to l+1 */
    const sharp_ylmgen_dbl3 * const cf1=&fx[l+1];
    Tv c0=vload(cf1->f[0]), c1=vload(cf1->f[1]), c2=vload(cf1->f[2]);
    for (int k=0; k<nvec; ++k)
      {
      rec1p.v[k] = vsub(vmul(vsub(cth.v[k],c1),vmul(c0,rec2p.v[k])),
                        vmul(c2,rec1p.v[k]));
      rec1m.v[k] = vsub(vmul(vadd(cth.v[k],c1),vmul(c0,rec2m.v[k])),
                        vmul(c2,rec1m.v[k]));
      }

    Y(saddstep_d)(p1,p2,rec2p,rec2m,alm+njobs*l,njobs);
    Y(saddstep_d)(p2,p1,rec1p,rec1m,alm+njobs*(l+1),njobs);

    /* coefficients for the step to l+2 */
    const sharp_ylmgen_dbl3 * const cf2=&fx[l+2];
    c0=vload(cf2->f[0]);
    c1=vload(cf2->f[1]);
    c2=vload(cf2->f[2]);
    for (int k=0; k<nvec; ++k)
      {
      rec2p.v[k] = vsub(vmul(vsub(cth.v[k],c1),vmul(c0,rec1p.v[k])),
                        vmul(c2,rec2p.v[k]));
      rec2m.v[k] = vsub(vmul(vadd(cth.v[k],c1),vmul(c0,rec1m.v[k])),
                        vmul(c2,rec2m.v[k]));
      }
    }

  /* leftover single l when the range holds an odd number of l values */
  if (l==lmax)
    Y(saddstep_d)(p1, p2, rec2p, rec2m, alm+njobs*l, njobs);
  }
/* First-derivative alm->map driver for one block of rings.
   cth   : vector block of cos(theta) values
   gen   : Ylm generator state (lmax, m, recursion coefficients fx, scale
           correction table cf)
   job   : supplies the alm input buffer (almtmp); its opcnt is updated
   p1,p2 : output accumulators
   njobs : number of simultaneous transforms
   done  : set to 1 when l exceeds lmax before the kernel is reached

   The structure is: obtain the starting l and recursion values from
   iter_to_ieee_spin, step through l two at a time while applying the scale
   correction factors explicitly, then fold the factors into the recursion
   values and let alm2map_deriv1_kernel process the remaining l range. */
static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
  int *done)
  {
  const int lmax=gen->lmax;
  int l;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;

  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax)
    {
    *done=1;
    return;
    }
  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);

  while (!full_ieee)
    {
    /* first l of the pair: corrected rec2 values feed (p1,p2) */
    const Tb w2p=Y(Tbprod)(rec2p,corfacp), w2m=Y(Tbprod)(rec2m,corfacm);
    Y(saddstep_d)(p1, p2, w2p, w2m, alm+njobs*l, njobs);
    ++l;
    if (l>lmax)
      {
      *done=1;
      return;
      }
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);

    /* second l of the pair: corrected rec1 values feed (p2,p1) */
    const Tb w1p=Y(Tbprod)(rec1p,corfacp), w1m=Y(Tbprod)(rec1m,corfacm);
    Y(saddstep_d)(p2, p1, w1p, w1m, alm+njobs*l, njobs);
    ++l;
    if (l>lmax)
      {
      *done=1;
      return;
      }
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);

    /* both rescale calls must always run, hence no short-circuit here */
    const int changed_p=Y(rescale)(&rec1p,&rec2p,&scalep);
    const int changed_m=Y(rescale)(&rec1m,&rec2m,&scalem);
    if (changed_p | changed_m)
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
      full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
      }
    }

  /* l<=lmax holds here: every path that pushed l past lmax returned above */
  Y(Tbmuleq)(&rec1p,corfacp);
  Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm);
  Y(Tbmuleq)(&rec2m,corfacm);
  Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
    lmax, njobs);
  }
/* Zero-fills an object in place. sizeof(var) is the full object size, so for
   the variable-length arrays used below this clears every element. */
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
/* Processes one m value (job->ainfo->mval[mi]) for the rings in the index
   range [llim, ulim), in chunks of nval = nvec*VLEN rings.

   job    : transform description; type and spin select the code path, and
            job->phase is written (alm2map variants) or read (map2alm)
   ispair : per-ring flag, indexed by the ring number taken from idx[];
            when set, a second phase entry per component is used
            (presumably the mirrored ring of a north/south pair -- confirm)
   cth_, sth_ : per-ring cos(theta) / sin(theta), indexed via idx[]
   llim, ulim : only the difference ulim-llim (number of rings) is used here
   gen    : Ylm generator; prepared for this m before any work is done
   mi     : index of the m value; also part of the phase index
   idx    : maps the position within the chunk range to the ring number
   njobs  : number of simultaneous transforms

   Phase layout used below: for spin 0 each (ring, m, job) owns 2 consecutive
   entries, for spin != 0 it owns 4 (Q at +0/+1, U at +2/+3). */
static void Y(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{
case SHARP_ALM2MAP:
case SHARP_ALM2MAP_DERIV1:
{
/* With spin==0 both job types are handled by calc_alm2map. */
if (job->spin==0)
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
/* Accumulators start at zero; once `done` is set the computation is
   skipped and these zeros are what gets stored into job->phase. */
Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* Pad an incomplete final chunk by repeating the last valid ring. */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
/* Store results; padded lanes (itot beyond the range) are not written. */
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
/* Sum goes to the first entry, difference to the second (pairs only). */
job->phase[phas_idx] = r1+r2;
if (ispair[itot])
job->phase[phas_idx+1] = r1-r2;
}
}
}
}
}
else
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
/* Same zero-initialisation scheme as in the spin-0 branch. */
Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* Pad an incomplete final chunk by repeating the last valid ring. */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
}
/* Conditional expression used purely to pick which routine to call. */
(job->type==SHARP_ALM2MAP) ?
Y(calc_alm2map_spin )
(cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done) :
Y(calc_alm2map_deriv1)
(cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
job->phase[phas_idx] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[itot])
{
dcmplx *phQ = &(job->phase[phas_idx+1]),
*phU = &(job->phase[phas_idx+3]);
*phQ = q1-q2;
*phU = u1-u2;
/* The second-entry values change sign when mhi-m+s is odd. */
if ((gen->mhi-gen->m+gen->s)&1)
{ *phQ=-(*phQ); *phU=-(*phU); }
}
}
}
}
}
}
break;
}
case SHARP_MAP2ALM:
{
if (job->spin==0)
{
int done=0;
/* Unlike the alm2map case, the loop stops as soon as `done` is set. */
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
/* Padded lanes keep these zeros, since only valid lanes are filled below. */
Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* Pad an incomplete final chunk by repeating the last valid ring. */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx ph1=job->phase[phas_idx];
/* Rings without a pair contribute zero for the second entry. */
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
}
}
}
Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
}
else
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
/* Padded lanes keep these zeros, since only valid lanes are filled below. */
Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* Pad an incomplete final chunk by repeating the last valid ring. */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx p1Q=job->phase[phas_idx],
p1U=job->phase[phas_idx+2],
p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
/* Same parity-dependent sign flip as applied when writing in alm2map. */
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
}
}
}
Y(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
}
break;
}
}
}
#undef VZERO

View File

@ -1,11 +1,21 @@
#define Tb CONCAT2(Tb,nvec) #define Tb CONCAT2(Tb,nvec)
#define Y(arg) CONCAT2(arg,nvec) #define Y(arg) CONCAT2(arg,nvec)
#include "sharp_core_inc.c" #include "sharp_core_inc.c"
#if (MAXJOB_SPECIAL<6)
#include "sharp_core_inc3.c" #if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
#define NJ1 , int njobs
#define NJ2 , njobs
#define Z(arg) CONCAT2(arg,nvec)
#include "sharp_core_inc2.c"
#undef Z
#undef NJ1
#undef NJ2
#endif #endif
#if (MAXJOB_SPECIAL>=1) #define NJ1
#define NJ2
#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
#define njobs 1 #define njobs 1
#define Z(arg) CONCAT3(arg,nvec,njobs) #define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c" #include "sharp_core_inc2.c"
@ -13,7 +23,7 @@
#undef njobs #undef njobs
#endif #endif
#if (MAXJOB_SPECIAL>=2) #if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
#define njobs 2 #define njobs 2
#define Z(arg) CONCAT3(arg,nvec,njobs) #define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c" #include "sharp_core_inc2.c"
@ -21,7 +31,7 @@
#undef njobs #undef njobs
#endif #endif
#if (MAXJOB_SPECIAL>=3) #if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
#define njobs 3 #define njobs 3
#define Z(arg) CONCAT3(arg,nvec,njobs) #define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c" #include "sharp_core_inc2.c"
@ -29,7 +39,7 @@
#undef njobs #undef njobs
#endif #endif
#if (MAXJOB_SPECIAL>=4) #if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
#define njobs 4 #define njobs 4
#define Z(arg) CONCAT3(arg,nvec,njobs) #define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c" #include "sharp_core_inc2.c"
@ -37,7 +47,7 @@
#undef njobs #undef njobs
#endif #endif
#if (MAXJOB_SPECIAL>=5) #if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
#define njobs 5 #define njobs 5
#define Z(arg) CONCAT3(arg,nvec,njobs) #define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c" #include "sharp_core_inc2.c"
@ -45,7 +55,7 @@
#undef njobs #undef njobs
#endif #endif
#if (MAXJOB_SPECIAL>=6) #if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
#define njobs 6 #define njobs 6
#define Z(arg) CONCAT3(arg,nvec,njobs) #define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c" #include "sharp_core_inc2.c"
@ -53,5 +63,8 @@
#undef njobs #undef njobs
#endif #endif
#undef NJ1
#undef NJ2
#undef Y #undef Y
#undef Tb #undef Tb

View File

@ -25,7 +25,7 @@
/*! \file sharp_cxx.h /*! \file sharp_cxx.h
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2015 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -35,7 +35,6 @@
#include "sharp_lowlevel.h" #include "sharp_lowlevel.h"
#include "sharp_geomhelpers.h" #include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h" #include "sharp_almhelpers.h"
#include "xcomplex.h"
class sharp_base class sharp_base
{ {
@ -54,32 +53,50 @@ class sharp_base
void set_general_geometry (int nrings, const int *nph, const ptrdiff_t *ofs, void set_general_geometry (int nrings, const int *nph, const ptrdiff_t *ofs,
const int *stride, const double *phi0, const double *theta, const int *stride, const double *phi0, const double *theta,
const double *weight) const double *wgt)
{ {
sharp_make_geom_info (nrings, nph, ofs, stride, phi0, theta, weight, if (ginfo) sharp_destroy_geom_info(ginfo);
&ginfo); sharp_make_geom_info (nrings, nph, ofs, stride, phi0, theta, wgt, &ginfo);
} }
void set_ECP_geometry (int nrings, int nphi) void set_ECP_geometry (int nrings, int nphi)
{ sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &ginfo); } {
if (ginfo) sharp_destroy_geom_info(ginfo);
sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &ginfo);
}
void set_Gauss_geometry (int nrings, int nphi) void set_Gauss_geometry (int nrings, int nphi)
{ sharp_make_gauss_geom_info (nrings, nphi, 1, nphi, &ginfo); } {
if (ginfo) sharp_destroy_geom_info(ginfo);
sharp_make_gauss_geom_info (nrings, nphi, 0., 1, nphi, &ginfo);
}
void set_Healpix_geometry (int nside) void set_Healpix_geometry (int nside)
{ sharp_make_healpix_geom_info (nside, 1, &ginfo); } {
if (ginfo) sharp_destroy_geom_info(ginfo);
sharp_make_healpix_geom_info (nside, 1, &ginfo);
}
void set_weighted_Healpix_geometry (int nside, const double *weight) void set_weighted_Healpix_geometry (int nside, const double *weight)
{ sharp_make_weighted_healpix_geom_info (nside, 1, weight, &ginfo); } {
if (ginfo) sharp_destroy_geom_info(ginfo);
sharp_make_weighted_healpix_geom_info (nside, 1, weight, &ginfo);
}
void set_triangular_alm_info (int lmax, int mmax) void set_triangular_alm_info (int lmax, int mmax)
{ sharp_make_triangular_alm_info (lmax, mmax, 1, &ainfo); } {
if (ainfo) sharp_destroy_alm_info(ainfo);
sharp_make_triangular_alm_info (lmax, mmax, 1, &ainfo);
}
const sharp_geom_info* get_geom_info() const { return ginfo; }
const sharp_alm_info* get_alm_info() const { return ainfo; }
}; };
template<typename T> struct cxxjobhelper__ {}; template<typename T> struct cxxjobhelper__ {};
template<> struct cxxjobhelper__<double> template<> struct cxxjobhelper__<double>
{ enum {val=1}; }; { enum {val=SHARP_DP}; };
template<> struct cxxjobhelper__<float> template<> struct cxxjobhelper__<float>
{ enum {val=0}; }; { enum {val=0}; };
@ -88,52 +105,49 @@ template<> struct cxxjobhelper__<float>
template<typename T> class sharp_cxxjob: public sharp_base template<typename T> class sharp_cxxjob: public sharp_base
{ {
private: private:
static void *conv (xcomplex<T> *ptr)
{ return reinterpret_cast<void *>(ptr); }
static void *conv (const xcomplex<T> *ptr)
{ return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
static void *conv (T *ptr) static void *conv (T *ptr)
{ return reinterpret_cast<void *>(ptr); } { return reinterpret_cast<void *>(ptr); }
static void *conv (const T *ptr) static void *conv (const T *ptr)
{ return const_cast<void *>(reinterpret_cast<const void *>(ptr)); } { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
public: public:
void alm2map (const xcomplex<T> *alm, T *map, bool add) void alm2map (const T *alm, T *map, bool add)
{ {
void *aptr=conv(alm), *mptr=conv(map); void *aptr=conv(alm), *mptr=conv(map);
sharp_execute (SHARP_ALM2MAP, 0, add, &aptr, &mptr, ginfo, ainfo, 1, int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
cxxjobhelper__<T>::val,0,0,0); sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
flags,0,0);
} }
void alm2map_spin (const xcomplex<T> *alm1, const xcomplex<T> *alm2, void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2,
T *map1, T *map2, int spin, bool add) int spin, bool add)
{ {
void *aptr[2], *mptr[2]; void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2); aptr[0]=conv(alm1); aptr[1]=conv(alm2);
mptr[0]=conv(map1); mptr[1]=conv(map2); mptr[0]=conv(map1); mptr[1]=conv(map2);
sharp_execute (SHARP_ALM2MAP, spin, add, aptr, mptr, ginfo, ainfo, 1, int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
cxxjobhelper__<T>::val,0,0,0); sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
} }
void alm2map_der1 (const xcomplex<T> *alm, T *map1, T *map2, bool add) void alm2map_der1 (const T *alm, T *map1, T *map2, bool add)
{ {
void *aptr=conv(alm), *mptr[2]; void *aptr=conv(alm), *mptr[2];
mptr[0]=conv(map1); mptr[1]=conv(map2); mptr[0]=conv(map1); mptr[1]=conv(map2);
sharp_execute (SHARP_ALM2MAP_DERIV1, 1, add,&aptr, mptr, ginfo, ainfo, int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
1, cxxjobhelper__<T>::val,0,0,0); sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
} }
void map2alm (const T *map, xcomplex<T> *alm, bool add) void map2alm (const T *map, T *alm, bool add)
{ {
void *aptr=conv(alm), *mptr=conv(map); void *aptr=conv(alm), *mptr=conv(map);
sharp_execute (SHARP_MAP2ALM, 0, add, &aptr, &mptr, ginfo, ainfo, 1, int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
cxxjobhelper__<T>::val,0,0,0); sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
} }
void map2alm_spin (const T *map1, const T *map2, xcomplex<T> *alm1, void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
xcomplex<T> *alm2, int spin, bool add) int spin, bool add)
{ {
void *aptr[2], *mptr[2]; void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2); aptr[0]=conv(alm1); aptr[1]=conv(alm2);
mptr[0]=conv(map1); mptr[1]=conv(map2); mptr[0]=conv(map1); mptr[1]=conv(map2);
sharp_execute (SHARP_MAP2ALM, spin, add, aptr, mptr, ginfo, ainfo, 1, int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
cxxjobhelper__<T>::val,0,0,0); sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
} }
}; };

View File

@ -25,30 +25,24 @@
/*! \file sharp_geomhelpers.c /*! \file sharp_geomhelpers.c
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2006-2011 Max-Planck-Society * Copyright (C) 2006-2012 Max-Planck-Society<br>
* \author Martin Reinecke * Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl)
* \author Martin Reinecke \author Pavel Holoborodko
*/ */
#include <math.h> #include <math.h>
#include "sharp_geomhelpers.h" #include "sharp_geomhelpers.h"
#include "sharp_legendre_roots.h"
#include "c_utils.h" #include "c_utils.h"
#include "ls_fft.h"
#include <stdio.h>
void sharp_make_healpix_geom_info (int nside, int stride, void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
sharp_geom_info **geom_info) const int *rings, const double *weight, sharp_geom_info **geom_info)
{
double *weight=RALLOC(double,2*nside);
SET_ARRAY(weight,0,2*nside,1);
sharp_make_weighted_healpix_geom_info (nside, stride, weight, geom_info);
DEALLOC(weight);
}
void sharp_make_weighted_healpix_geom_info (int nside, int stride,
const double *weight, sharp_geom_info **geom_info)
{ {
const double pi=3.141592653589793238462643383279502884197; const double pi=3.141592653589793238462643383279502884197;
ptrdiff_t npix=(ptrdiff_t)nside*nside*12; ptrdiff_t npix=(ptrdiff_t)nside*nside*12;
ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1); ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1);
int nrings=4*nside-1;
double *theta=RALLOC(double,nrings); double *theta=RALLOC(double,nrings);
double *weight_=RALLOC(double,nrings); double *weight_=RALLOC(double,nrings);
@ -56,9 +50,10 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
double *phi0=RALLOC(double,nrings); double *phi0=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings); ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings); int *stride_=RALLOC(int,nrings);
ptrdiff_t curofs=0, checkofs; /* checkofs used for assertion introduced when adding rings arg */
for (int m=0; m<nrings; ++m) for (int m=0; m<nrings; ++m)
{ {
int ring=m+1; int ring = (rings==NULL)? (m+1) : rings[m];
ptrdiff_t northring = (ring>2*nside) ? 4*nside-ring : ring; ptrdiff_t northring = (ring>2*nside) ? 4*nside-ring : ring;
stride_[m] = stride; stride_[m] = stride;
if (northring < nside) if (northring < nside)
@ -66,7 +61,7 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
theta[m] = 2*asin(northring/(sqrt(6.)*nside)); theta[m] = 2*asin(northring/(sqrt(6.)*nside));
nph[m] = 4*northring; nph[m] = 4*northring;
phi0[m] = pi/nph[m]; phi0[m] = pi/nph[m];
ofs[m] = 2*northring*(northring-1)*stride; checkofs = 2*northring*(northring-1)*stride;
} }
else else
{ {
@ -78,14 +73,21 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
phi0[m] = 0; phi0[m] = 0;
else else
phi0[m] = pi/nph[m]; phi0[m] = pi/nph[m];
ofs[m] = (ncap + (northring-nside)*nph[m])*stride; checkofs = (ncap + (northring-nside)*nph[m])*stride;
ofs[m] = curofs;
} }
if (northring != ring) /* southern hemisphere */ if (northring != ring) /* southern hemisphere */
{ {
theta[m] = pi-theta[m]; theta[m] = pi-theta[m];
ofs[m] = (npix - nph[m])*stride - ofs[m]; checkofs = (npix - nph[m])*stride - checkofs;
ofs[m] = curofs;
} }
weight_[m]=4.*pi/npix*weight[northring-1]; weight_[m]=4.*pi/npix*((weight==NULL) ? 1. : weight[northring-1]);
if (rings==NULL) {
UTIL_ASSERT(curofs==checkofs, "Bug in computing ofs[m]");
}
ofs[m] = curofs;
curofs+=nph[m];
} }
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight_, sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight_,
@ -99,93 +101,13 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
DEALLOC(stride_); DEALLOC(stride_);
} }
static void gauleg (double x1, double x2, double *x, double *w, int n) void sharp_make_weighted_healpix_geom_info (int nside, int stride,
const double *weight, sharp_geom_info **geom_info)
{ {
const double pi = 3.141592653589793238462643383279502884197; sharp_make_subset_healpix_geom_info(nside, stride, 4 * nside - 1, NULL, weight, geom_info);
const double eps = 3.0E-14;
int m = (n+1)/2;
double xm = 0.5*(x2+x1);
double xl = 0.5*(x2-x1);
for(int i=1; i<=m; ++i)
{
double z = cos(pi*(i-0.25)/(n+0.5));
double pp;
int dobreak=0;
while(1)
{
double p1 = 1.0, p2 = 0.0;
double z1 = z;
int j;
for(j=1; j<=n; ++j)
{
double p3 = p2;
p2 = p1;
p1 = ((2*j-1)*z*p2-(j-1)*p3)/j;
}
pp = n*(z*p1-p2)/(z*z-1);
z = z1 - p1/pp;
if (dobreak) break;
if (fabs(z-z1) <= eps) dobreak=1;
}
x[i-1] = xm - xl*z;
x[n-i] = xm + xl*z;
w[i-1] = w[n-i] = 2*xl/((1-z*z)*pp*pp);
}
} }
static void makeweights (int bw, double *weights) void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
{
const double pi = 3.141592653589793238462643383279502884197;
const double fudge = pi/(4*bw);
for (int j=0; j<2*bw; ++j)
{
double tmpsum = 0;
for (int k=0; k<bw; ++k)
tmpsum += 1./(2*k+1) * sin((2*j+1)*(2*k+1)*fudge);
tmpsum *= sin((2*j+1)*fudge);
tmpsum *= 2./bw;
weights[j] = tmpsum;
/* weights[j + 2*bw] = tmpsum * sin((2*j+1)*fudge); */
}
}
void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
double *weight=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
gauleg(-1,1,theta,weight,nrings);
for (int m=0; m<nrings; ++m)
{
theta[m] = acos(-theta[m]);
nph[m]=nphi;
phi0[m]=0;
ofs[m]=(ptrdiff_t)m*stride_lat;
stride_[m]=stride_lon;
weight[m]*=2*pi/nphi;
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight,
geom_info);
DEALLOC(theta);
DEALLOC(weight);
DEALLOC(nph);
DEALLOC(phi0);
DEALLOC(ofs);
DEALLOC(stride_);
}
void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info) int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{ {
const double pi=3.141592653589793238462643383279502884197; const double pi=3.141592653589793238462643383279502884197;
@ -197,12 +119,10 @@ void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings); ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings); int *stride_=RALLOC(int,nrings);
UTIL_ASSERT((nrings&1)==0, sharp_legendre_roots(nrings,theta,weight);
"Even number of rings needed for equidistant grid!");
makeweights(nrings/2,weight);
for (int m=0; m<nrings; ++m) for (int m=0; m<nrings; ++m)
{ {
theta[m] = (m+0.5)*pi/nrings; theta[m] = acos(-theta[m]);
nph[m]=nphi; nph[m]=nphi;
phi0_[m]=phi0; phi0_[m]=phi0;
ofs[m]=(ptrdiff_t)m*stride_lat; ofs[m]=(ptrdiff_t)m*stride_lat;
@ -220,3 +140,178 @@ void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
DEALLOC(ofs); DEALLOC(ofs);
DEALLOC(stride_); DEALLOC(stride_);
} }
/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
double *weight=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0_=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
weight[0]=2.;
for (int k=1; k<=(nrings-1)/2; ++k)
{
weight[2*k-1]=2./(1.-4.*k*k)*cos((k*pi)/nrings);
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
}
if ((nrings&1)==0) weight[nrings-1]=0.;
real_plan plan = make_real_plan(nrings);
real_plan_backward_fftpack(plan,weight);
kill_real_plan(plan);
for (int m=0; m<(nrings+1)/2; ++m)
{
theta[m]=pi*(m+0.5)/nrings;
theta[nrings-1-m]=pi-theta[m];
nph[m]=nph[nrings-1-m]=ppring;
phi0_[m]=phi0_[nrings-1-m]=phi0;
ofs[m]=(ptrdiff_t)m*stride_lat;
ofs[nrings-1-m]=(ptrdiff_t)((nrings-1-m)*stride_lat);
stride_[m]=stride_[nrings-1-m]=stride_lon;
weight[m]=weight[nrings-1-m]=weight[m]*2*pi/(nrings*nph[m]);
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
geom_info);
DEALLOC(theta);
DEALLOC(weight);
DEALLOC(nph);
DEALLOC(phi0_);
DEALLOC(ofs);
DEALLOC(stride_);
}
/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
double *weight=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0_=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
int n=nrings-1;
SET_ARRAY(weight,0,nrings,0.);
double dw=-1./(n*n-1.+(n&1));
weight[0]=2.+dw;
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k) + dw;
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
real_plan plan = make_real_plan(n);
real_plan_backward_fftpack(plan,weight);
kill_real_plan(plan);
weight[n]=weight[0];
for (int m=0; m<(nrings+1)/2; ++m)
{
theta[m]=pi*m/(nrings-1.);
if (theta[m]<1e-15) theta[m]=1e-15;
theta[nrings-1-m]=pi-theta[m];
nph[m]=nph[nrings-1-m]=ppring;
phi0_[m]=phi0_[nrings-1-m]=phi0;
ofs[m]=(ptrdiff_t)m*stride_lat;
ofs[nrings-1-m]=(ptrdiff_t)((nrings-1-m)*stride_lat);
stride_[m]=stride_[nrings-1-m]=stride_lon;
weight[m]=weight[nrings-1-m]=weight[m]*2*pi/(n*nph[m]);
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
geom_info);
DEALLOC(theta);
DEALLOC(weight);
DEALLOC(nph);
DEALLOC(phi0_);
DEALLOC(ofs);
DEALLOC(stride_);
}
/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
double *weight=RALLOC(double,nrings+1);
int *nph=RALLOC(int,nrings);
double *phi0_=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
int n=nrings+1;
SET_ARRAY(weight,0,n,0.);
weight[0]=2.;
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k);
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
real_plan plan = make_real_plan(n);
real_plan_backward_fftpack(plan,weight);
kill_real_plan(plan);
for (int m=0; m<nrings; ++m)
weight[m]=weight[m+1];
for (int m=0; m<(nrings+1)/2; ++m)
{
theta[m]=pi*(m+1)/(nrings+1.);
theta[nrings-1-m]=pi-theta[m];
nph[m]=nph[nrings-1-m]=ppring;
phi0_[m]=phi0_[nrings-1-m]=phi0;
ofs[m]=(ptrdiff_t)m*stride_lat;
ofs[nrings-1-m]=(ptrdiff_t)((nrings-1-m)*stride_lat);
stride_[m]=stride_[nrings-1-m]=stride_lon;
weight[m]=weight[nrings-1-m]=weight[m]*2*pi/(n*nph[m]);
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
geom_info);
DEALLOC(theta);
DEALLOC(weight);
DEALLOC(nph);
DEALLOC(phi0_);
DEALLOC(ofs);
DEALLOC(stride_);
}
void sharp_make_mw_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0_=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
for (int m=0; m<nrings; ++m)
{
theta[m]=pi*(2.*m+1.)/(2.*nrings-1.);
if (theta[m]>pi-1e-15) theta[m]=pi-1e-15;
nph[m]=ppring;
phi0_[m]=phi0;
ofs[m]=(ptrdiff_t)m*stride_lat;
stride_[m]=stride_lon;
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, NULL,
geom_info);
DEALLOC(theta);
DEALLOC(nph);
DEALLOC(phi0_);
DEALLOC(ofs);
DEALLOC(stride_);
}

View File

@ -25,7 +25,7 @@
/*! \file sharp_geomhelpers.h /*! \file sharp_geomhelpers.h
* SHARP helper function for the creation of grid geometries * SHARP helper function for the creation of grid geometries
* *
* Copyright (C) 2006-2011 Max-Planck-Society * Copyright (C) 2006-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -39,26 +39,41 @@ extern "C" {
#endif #endif
/*! Creates a geometry information describing a HEALPix map with an /*! Creates a geometry information describing a HEALPix map with an
Nside parameter \a nside. Nside parameter \a nside. \a weight contains the relative ring
weights and must have \a 2*nside entries. The rings array contains
the indices of the rings, with 1 being the first ring at the north
pole; if NULL then we take them to be sequential. Pass 4 * nside - 1
as nrings and NULL to rings to get the full HEALPix grid.
\note if \a weight is a null pointer, all weights are assumed to be 1.
\note if \a rings is a null pointer, take all rings
\ingroup geominfogroup */ \ingroup geominfogroup */
void sharp_make_healpix_geom_info (int nside, int stride, void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
sharp_geom_info **geom_info); const int *rings, const double *weight, sharp_geom_info **geom_info);
/*! Creates a geometry information describing a HEALPix map with an /*! Creates a geometry information describing a HEALPix map with an
Nside parameter \a nside. \a weight contains the relative ring Nside parameter \a nside. \a weight contains the relative ring
weights and must have \a 2*nside entries. weights and must have \a 2*nside entries.
\note if \a weight is a null pointer, all weights are assumed to be 1.
\ingroup geominfogroup */ \ingroup geominfogroup */
void sharp_make_weighted_healpix_geom_info (int nside, int stride, void sharp_make_weighted_healpix_geom_info (int nside, int stride,
const double *weight, sharp_geom_info **geom_info); const double *weight, sharp_geom_info **geom_info);
/*! Creates a geometry information describing a HEALPix map with an
Nside parameter \a nside.
\ingroup geominfogroup */
static inline void sharp_make_healpix_geom_info (int nside, int stride,
sharp_geom_info **geom_info)
{ sharp_make_weighted_healpix_geom_info (nside, stride, NULL, geom_info); }
/*! Creates a geometry information describing a Gaussian map with \a nrings /*! Creates a geometry information describing a Gaussian map with \a nrings
iso-latitude rings and \a nphi pixels per ring. The azimuth of the first iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
pixel in each ring is 0. The index difference between two adjacent pixels pixel in each ring is \a phi0 (in radians). The index difference between
in an iso-latitude ring is \a stride_lon, the index difference between the two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
two start pixels in consecutive iso-latitude rings is \a stride_lat. difference between the two start pixels in consecutive iso-latitude rings
is \a stride_lat.
\ingroup geominfogroup */ \ingroup geominfogroup */
void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon, void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
int stride_lat, sharp_geom_info **geom_info); int stride_lon, int stride_lat, sharp_geom_info **geom_info);
/*! Creates a geometry information describing an ECP map with \a nrings /*! Creates a geometry information describing an ECP map with \a nrings
iso-latitude rings and \a nphi pixels per ring. The azimuth of the first iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
@ -68,11 +83,67 @@ void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
is \a stride_lat. is \a stride_lat.
\note The spacing of pixel centers is equidistant in colatitude and \note The spacing of pixel centers is equidistant in colatitude and
longitude. longitude.
\note \a nrings must be an even number.
\note The sphere is pixelized in a way that the colatitude of the first ring \note The sphere is pixelized in a way that the colatitude of the first ring
is \a 0.5*(pi/nrings). There are no pixel centers at the poles. is \a 0.5*(pi/nrings) and the colatitude of the last ring is
\a pi-0.5*(pi/nrings). There are no pixel centers at the poles.
\note This grid corresponds to Fejer's first rule.
\ingroup geominfogroup */ \ingroup geominfogroup */
void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0, void sharp_make_fejer1_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info);
/*! Old name for sharp_make_fejer1_geom_info()
\ingroup geominfogroup */
static inline void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
sharp_make_fejer1_geom_info (nrings, nphi, phi0, stride_lon, stride_lat,
geom_info);
}
/*! Creates a geometry information describing an ECP map with \a nrings
iso-latitude rings and \a ppring pixels per ring. The azimuth of the first
pixel in each ring is \a phi0 (in radians). The index difference between
two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
difference between the two start pixels in consecutive iso-latitude rings
is \a stride_lat.
\note The spacing of pixel centers is equidistant in colatitude and
longitude.
\note The sphere is pixelized in a way that the colatitude of the first ring
is \a 0 and that of the last ring is \a pi, i.e. the first and the last
ring are located at the poles.
\note This grid corresponds to Clenshaw-Curtis integration.
\ingroup geominfogroup */
void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info);
/*! Creates a geometry information describing an ECP map with \a nrings
iso-latitude rings and \a ppring pixels per ring. The azimuth of the first
pixel in each ring is \a phi0 (in radians). The index difference between
two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
difference between the two start pixels in consecutive iso-latitude rings
is \a stride_lat.
\note The spacing of pixel centers is equidistant in colatitude and
longitude.
\note The sphere is pixelized in a way that the colatitude of the first ring
is \a pi/(nrings+1) and that of the last ring is \a pi-pi/(nrings+1).
There are no pixel centers at the poles.
\note This grid corresponds to Fejer's second rule.
\ingroup geominfogroup */
void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info);
/*! Creates a geometry information describing a map with \a nrings
iso-latitude rings and \a ppring pixels per ring. The azimuth of the first
pixel in each ring is \a phi0 (in radians). The index difference between
two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
difference between the two start pixels in consecutive iso-latitude rings
is \a stride_lat.
\note The spacing of pixel centers is equidistant in colatitude and
longitude.
\note The sphere is pixelized in a way that the colatitude of the first ring
is \a pi/(2*nrings-1) and that of the last ring is \a pi, i.e. only the
last ring is located at a pole.
\note This is the grid introduced by McEwen & Wiaux 2011.
\note This function does \e not define any quadrature weights.
\ingroup geominfogroup */
void sharp_make_mw_geom_info (int nrings, int ppring, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info); int stride_lon, int stride_lat, sharp_geom_info **geom_info);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -25,8 +25,8 @@
/*! \file sharp_internal.h /*! \file sharp_internal.h
* Internally used functionality for the spherical transform library. * Internally used functionality for the spherical transform library.
* *
* Copyright (C) 2006-2012 Max-Planck-Society * Copyright (C) 2006-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#ifndef PLANCK_SHARP_INTERNAL_H #ifndef PLANCK_SHARP_INTERNAL_H
@ -38,23 +38,22 @@
#include "sharp.h" #include "sharp.h"
typedef enum { FLOAT, DOUBLE } sharp_fde; #define SHARP_MAXTRANS 100
typedef struct typedef struct
{ {
sharp_jobtype type; sharp_jobtype type;
int spin; int spin;
int add_output;
int nmaps, nalm; int nmaps, nalm;
sharp_fde fde; int flags;
void **map; void **map;
void **alm; void **alm;
int s_m, s_th; // strides in m and theta direction
complex double *phase; complex double *phase;
double *norm_l; double *norm_l;
complex double *almtmp; complex double *almtmp;
const sharp_geom_info *ginfo; const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo; const sharp_alm_info *ainfo;
int nv;
double time; double time;
int ntrans; int ntrans;
unsigned long long opcnt; unsigned long long opcnt;
@ -62,5 +61,6 @@ typedef struct
int sharp_get_nv_max (void); int sharp_get_nv_max (void);
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans); int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
int sharp_get_mlim (int lmax, int spin, double sth, double cth);
#endif #endif

1319
external/sharp/libsharp/sharp_legendre.c vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,176 @@
/*
NOTE NOTE NOTE
This file is edited in sharp_legendre.c.in which is then preprocessed.
Do not make manual modifications to sharp_legendre.c.
NOTE NOTE NOTE
*/
/*
* This file is part of libsharp.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file sharp_legendre.c.in
*
* Copyright (C) 2015 University of Oslo
* \author Dag Sverre Seljebotn
*/
#ifndef NO_LEGENDRE
#if (VLEN==8)
#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
/* ...or test it (it probably works) and remove this check */
#endif
#ifndef SHARP_LEGENDRE_CS
#define SHARP_LEGENDRE_CS 4
#endif
#define MAX_CS 6
#if (SHARP_LEGENDRE_CS > MAX_CS)
#error (SHARP_LEGENDRE_CS > MAX_CS)
#endif
#include "sharp_legendre.h"
#include "sharp_vecsupport.h"
#include <malloc.h>
/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
/*{ for cs in range(1, 7) }*/
/*
  Evaluates  y(x) = sum_{l=0}^{lmax} bl[l] * P_l(x)  for a fixed-size chunk of
  abscissae, keeping one SIMD register per group of VLEN values.

  recfacs  recursion factors; only entries 2..lmax are read
  bl       series coefficients; entries 0..lmax are read
  lmax     highest degree contained in bl (expected to be >= 0)
  xarr     input abscissae for this chunk (loaded with unaligned loads)
  out      receives the series value for every abscissa of the chunk
           (written with unaligned stores)

  The l=1 term is only added when lmax >= 1, so that a series consisting of
  the constant term alone does not read past the end of bl.
*/
static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax,
                                              {{scalar}} xarr[({{cs}}) * VLEN{{T}}],
                                              {{scalar}} out[({{cs}}) * VLEN{{T}}]) {
    /*{ for i in range(cs) }*/
    Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}};
    /*{ endfor }*/
    Tv{{T}} W1, W2, b, R;
    ptrdiff_t l;

    /* l = 0 term: P_0(x) = 1, hence y = bl[0]. The coefficient is the same
       for every vector, so it is broadcast only once. */
    b = vload{{T}}(*bl);
    /*{ for i in range(cs) }*/
    x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
    Pm1_{{i}} = vload{{T}}(1.0);
    P_{{i}} = x{{i}};
    y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
    /*{ endfor }*/

    /* l = 1 term: P_1(x) = x. Must be skipped for lmax == 0, because bl[1]
       does not exist in that case. */
    if (lmax >= 1) {
        b = vload{{T}}(*(bl + 1));
        /*{ for i in range(cs) }*/
        vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
        /*{ endfor }*/
    }

    for (l = 2; l <= lmax; ++l) {
        b = vload{{T}}(*(bl + l));
        R = vload{{T}}(*(recfacs + l));

        /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
        */
        /*{ for i in range(cs) }*/
        Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}};
        W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
        W2 = W1;
        W2 = vsub{{T}}(W2, Pm2_{{i}});
        P_{{i}} = W1;
        vfmaeq{{T}}(P_{{i}}, W2, R);
        /* accumulate bl[l] * P_l(x) */
        vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
        /*{ endfor }*/
    }
    /*{ for i in range(cs) }*/
    vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}});
    /*{ endfor }*/
}
/*{ endfor }*/
/*{ endfor }*/
/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
/*
  Fills r[0..lmax] with the recursion factors used by the Legendre transform:
  r[0] = 0, r[1] = 1 and r[l] = (l - 1) / l for l >= 2.

  r     output array with room for lmax + 1 entries
  lmax  highest degree for which a factor is stored

  Only entries 0..lmax are written, so an array of exactly lmax + 1 entries
  is sufficient even for lmax == 0; nothing is written for lmax < 0.
*/
void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
    ptrdiff_t l;
    if (lmax < 0) return;
    r[0] = 0;
    /* r[1] lies outside an array of lmax + 1 entries when lmax == 0 */
    if (lmax >= 1) r[1] = 1;
    /* (l - 1) / l, for l >= 2 */
    for (l = 2; l <= lmax; ++l) {
        r[l] = ({{scalar}})(l - 1) / ({{scalar}})l;
    }
}
/*{ endfor }*/
/*
Compute sum_l b_l P_l(x_i) for all i.
*/
#define LEN (SHARP_LEGENDRE_CS * VLEN)
#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
/*
  Computes out[i] = sum_{l=0}^{lmax} bl[l] * P_l(x[i]) for i = 0..nx-1.

  bl      series coefficients, entries 0..lmax
  recfac  recursion factors as produced by the matching
          sharp_legendre_transform_recfac routine, or NULL to have them
          computed (and released again) internally
  lmax    highest degree contained in bl
  x       nx input abscissae
  out     nx output values
  nx      number of abscissae

  The input is processed in chunks of SHARP_LEGENDRE_CS SIMD vectors; the
  last, possibly partial, chunk is padded in a local buffer so that the
  vector kernels never touch memory outside x and out.

  NOTE(review): the result of malloc() is not checked; this function has no
  way of reporting an allocation failure to its caller.
*/
void sharp_legendre_transform{{T}}({{scalar}} *bl,
                                   {{scalar}} *recfac,
                                   ptrdiff_t lmax,
                                   {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) {
    /* A chunk never holds more than MAX_CS vectors, for input and output alike. */
    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * VLEN{{T}}];
    int compute_recfac;
    ptrdiff_t i, j, len;

    compute_recfac = (recfac == NULL);
    if (compute_recfac) {
        /* The recfac routine may store r[0] and r[1] regardless of lmax, so
           always reserve at least two entries to avoid a heap overflow for
           lmax < 1. */
        ptrdiff_t nrecfac = (lmax < 1) ? 2 : lmax + 1;
        recfac = malloc(sizeof({{scalar}}) * nrecfac);
        sharp_legendre_transform_recfac{{T}}(recfac, lmax);
    }

    /* Define the padding lanes once; a partial chunk only overwrites the
       leading len entries. */
    for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0;

    for (i = 0; i < nx; i += LEN{{T}}) {
        len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i);
        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
        /* Select the kernel by the number of vectors needed for len values. */
        switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) {
          case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
          case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
          case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
          case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
          case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
          case 1:
          case 0:
              legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
        }
        /* Copy back only the valid part of the chunk. */
        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
    }
    if (compute_recfac) {
        free(recfac);
    }
}
/*{ endfor }*/
#endif

View File

@ -0,0 +1,62 @@
/*
* This file is part of libsharp.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file sharp_legendre.h
* Interface for the Legendre transform parts of the spherical transform library.
*
* Copyright (C) 2015 University of Oslo
* \author Dag Sverre Seljebotn
*/
#ifndef SHARP_LEGENDRE_H
#define SHARP_LEGENDRE_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef NO_LEGENDRE
/*! Computes \a out[i] = sum_l \a bl[l] * P_l(\a x[i]) for all \a i in
    [0; \a nx), where P_l is the Legendre polynomial of degree \a l and the
    sum runs over 0 <= \a l <= \a lmax.
    \param bl series coefficients
    \param recfac recursion factors as computed by
      sharp_legendre_transform_recfac(), or NULL; in the latter case the
      factors are computed internally for this call and released afterwards.
    \param lmax maximum degree \a l in the sum
    \param x array with \a nx abscissae
    \param out array with \a nx entries receiving the result
    \param nx number of abscissae */
void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
double *out, ptrdiff_t nx);
/*! Single precision variant of sharp_legendre_transform(); \a recfac must
    come from sharp_legendre_transform_recfac_s() or be NULL. */
void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
float *out, ptrdiff_t nx);
/*! Computes the recursion factors needed by sharp_legendre_transform():
    \a r[0]=0, \a r[1]=1 and \a r[l]=(l-1)/l for 2 <= \a l <= \a lmax.
    \param r output array; providing room for max(\a lmax+1, 2) entries is
      always sufficient
    \param lmax maximum degree \a l */
void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
/*! Single precision variant of sharp_legendre_transform_recfac(). */
void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,67 @@
/* Function adapted from GNU GSL file glfixed.c
Original author: Pavel Holoborodko (http://www.holoborodko.com)
Adjustments by M. Reinecke
- adjusted interface (keep epsilon internal, return full number of points)
- removed precomputed tables
- tweaked Newton iteration to obtain higher accuracy */
#include <math.h>
#include "sharp_legendre_roots.h"
#include "c_utils.h"
/* Returns 1-x*x. For |x|>0.1 the factored form (1+x)*(1-x) is used, otherwise
   the direct form 1-x*x. */
static inline double one_minus_x2 (double x)
  {
  if (fabs(x)>0.1)
    return (1.+x)*(1.-x);
  return 1.-x*x;
  }
/* Computes the n roots of the Legendre polynomial of degree n together with
   the associated Gaussian quadrature weights. Only the first (n+1)/2 roots
   are iterated; every result is stored at position i-1 (with negative sign)
   and at the mirrored position n-i. Each root is refined by Newton
   iteration, starting from a closed-form initial guess. */
void sharp_legendre_roots(int n, double *x, double *w)
  {
  const double pi = 3.141592653589793238462643383279502884197;
  const double eps = 3e-14;
  int nhalf = (n+1)>>1;

  /* constants entering the initial guess */
  double scale = 1 - (1-1./n) / (8.*n*n);
  double ang = 1./(4.*n+2.);

#pragma omp parallel
{
  int i;
#pragma omp for schedule(dynamic,100)
  for (i=1; i<=nhalf; ++i)
    {
    double root = cos(pi * ((i<<2)-1) * ang) * scale;
    double deriv;
    int converged=0;
    int niter=0;

    for (;;)
      {
      double pprev = 1.0; /* polynomial of degree k-1 at root */
      double pcur = root; /* polynomial of degree k at root */
      double step, next;

      /* upward recurrence, written as x*P_1 + (k-1)/k * (x*P_1 - P_2) */
      for (int k=2; k<=n; k++)
        {
        double pprev2 = pprev;
        pprev = pcur;
        pcur = root*pprev + (k-1.)/k * (root*pprev-pprev2);
        }

      deriv = (pprev - root*pcur) * n / one_minus_x2(root);

      /* Newton step */
      next = root - pcur/deriv;
      step = root-next;
      root = next;

      /* one additional iteration is carried out after the step size has
         dropped below eps */
      if (converged) break;
      if (fabs(step)<=eps) converged=1;
      UTIL_ASSERT(++niter<100,"convergence problem");
      }

    x[i-1] = -root;
    x[n-i] = root;
    w[i-1] = w[n-i] = 2. / (one_minus_x2(root) * deriv * deriv);
    }
} // end of parallel region
  }

View File

@ -0,0 +1,50 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_legendre_roots.h
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef SHARP_LEGENDRE_ROOTS_H
#define SHARP_LEGENDRE_ROOTS_H
#ifdef __cplusplus
extern "C" {
#endif
/*! Computes roots and Gaussian quadrature weights for Legendre polynomial
of degree \a n.
\param n Order of Legendre polynomial
\param x Array of length \a n for output (root position)
\param w Array of length \a n for output (weight for Gaussian quadrature)
*/
void sharp_legendre_roots(int n, double *x, double *w);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -25,8 +25,8 @@
/*! \file sharp_lowlevel.h /*! \file sharp_lowlevel.h
* Low-level, portable interface for the spherical transform library. * Low-level, portable interface for the spherical transform library.
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#ifndef PLANCK_SHARP_LOWLEVEL_H #ifndef PLANCK_SHARP_LOWLEVEL_H
@ -60,7 +60,7 @@ typedef struct
typedef struct typedef struct
{ {
sharp_ringpair *pair; sharp_ringpair *pair;
int npairs; int npairs, nphmax;
} sharp_geom_info; } sharp_geom_info;
/*! \defgroup almgroup Helpers for dealing with a_lm */ /*! \defgroup almgroup Helpers for dealing with a_lm */
@ -76,6 +76,8 @@ typedef struct
int nm; int nm;
/*! Array with \a nm entries containing the individual m values */ /*! Array with \a nm entries containing the individual m values */
int *mval; int *mval;
/*! Combination of flags from sharp_almflags */
int flags;
/*! Array with \a nm entries containing the (hypothetical) indices of /*! Array with \a nm entries containing the (hypothetical) indices of
the coefficients with quantum numbers 0,\a mval[i] */ the coefficients with quantum numbers 0,\a mval[i] */
ptrdiff_t *mvstart; ptrdiff_t *mvstart;
@ -83,30 +85,59 @@ typedef struct
ptrdiff_t stride; ptrdiff_t stride;
} sharp_alm_info; } sharp_alm_info;
/*! Creates an Alm data structure information from the following parameters: /*! alm_info flags */
typedef enum { SHARP_PACKED = 1,
/*!< m=0-coefficients are packed so that the (zero) imaginary part is
not present. mvstart is in units of *real* float/double for all
m; stride is in units of reals for m=0 and complex for m!=0 */
SHARP_REAL_HARMONICS = 1<<6
/*!< Use the real spherical harmonic convention. For
m==0, the alm are treated exactly the same as in
the complex case. For m!=0, alm[i] represent a
pair (+abs(m), -abs(m)) instead of (real, imag),
and the coefficients are scaled by a factor of
sqrt(2) relative to the complex case. In other
words, (sqrt(.5) * alm[i]) recovers the
corresponding complex coefficient (when accessed
as complex).
*/
} sharp_almflags;
/*! Creates an a_lm data structure from the following parameters:
\param lmax maximum \a l quantum number (>=0) \param lmax maximum \a l quantum number (>=0)
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax) \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
\param stride the stride between consecutive a_lm entries \param stride the stride between entries with identical \a m, and \a l
differing by 1.
\param mstart the index of the (hypothetical) coefficient with the \param mstart the index of the (hypothetical) coefficient with the
quantum numbers 0,\a m. Must have \a mmax+1 entries. quantum numbers 0,\a m. Must have \a mmax+1 entries.
\param alm_info will hold a pointer to the newly created data structure \param alm_info will hold a pointer to the newly created data structure
*/ */
void sharp_make_alm_info (int lmax, int mmax, int stride, void sharp_make_alm_info (int lmax, int mmax, int stride,
const ptrdiff_t *mstart, sharp_alm_info **alm_info); const ptrdiff_t *mstart, sharp_alm_info **alm_info);
/*! Creates an Alm data structure information from the following parameters: /*! Creates an a_lm data structure which from the following parameters:
\param lmax maximum \a l quantum number (>=0) \param lmax maximum \a l quantum number (\a >=0)
\param nm number of different \a m (<=\a lmax+1) \param nm number of different \a m (\a 0<=nm<=lmax+1)
\param stride the stride between consecutive a_lm entries \param stride the stride between entries with identical \a m, and \a l
differing by 1.
\param mval array with \a nm entries containing the individual m values \param mval array with \a nm entries containing the individual m values
\param mvstart array with \a nm entries containing the (hypothetical) \param mvstart array with \a nm entries containing the (hypothetical)
indices of the coefficients with the quantum numbers 0,\a mval[i] indices of the coefficients with the quantum numbers 0,\a mval[i]
\param flags a combination of sharp_almflags (pass 0 unless you know you need this)
\param alm_info will hold a pointer to the newly created data structure \param alm_info will hold a pointer to the newly created data structure
*/ */
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval, void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
const ptrdiff_t *mvstart, sharp_alm_info **alm_info); const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
/*! Returns the index of the coefficient with quantum numbers \a l, /*! Returns the index of the coefficient with quantum numbers \a l,
\a mval[mi]. */ \a mval[mi].
\note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
the index for the coefficient with the quantum numbers \a l, \a mi. */
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi); ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
flag is set, this is the number of "real" coefficients (for m < 0 and m >= 0),
otherwise it is the number of complex coefficients (with m>=0). */
ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
/*! Deallocates the a_lm info object. */ /*! Deallocates the a_lm info object. */
void sharp_destroy_alm_info (sharp_alm_info *info); void sharp_destroy_alm_info (sharp_alm_info *info);
@ -123,12 +154,19 @@ void sharp_destroy_alm_info (sharp_alm_info *info);
\param stride the stride between consecutive pixels \param stride the stride between consecutive pixels
\param phi0 the azimuth (in radians) of the first pixel in each ring \param phi0 the azimuth (in radians) of the first pixel in each ring
\param theta the colatitude (in radians) of each ring \param theta the colatitude (in radians) of each ring
\param weight the pixel weight to be used for the ring \param wgt the pixel weight to be used for the ring in map2alm
and adjoint map2alm transforms.
Pass NULL to use 1.0 as weight for all rings.
\param geom_info will hold a pointer to the newly created data structure \param geom_info will hold a pointer to the newly created data structure
*/ */
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs, void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
const int *stride, const double *phi0, const double *theta, const int *stride, const double *phi0, const double *theta,
const double *weight, sharp_geom_info **geom_info); const double *wgt, sharp_geom_info **geom_info);
/*! Counts the number of grid points needed for (the local part of) a map described
by \a info.
*/
ptrdiff_t sharp_map_size(const sharp_geom_info *info);
/*! Deallocates the geometry information in \a info. */ /*! Deallocates the geometry information in \a info. */
void sharp_destroy_geom_info (sharp_geom_info *info); void sharp_destroy_geom_info (sharp_geom_info *info);
@ -139,45 +177,91 @@ void sharp_destroy_geom_info (sharp_geom_info *info);
/*! \{ */ /*! \{ */
/*! Enumeration of SHARP job types. */ /*! Enumeration of SHARP job types. */
typedef enum { SHARP_MAP2ALM, /*!< analysis */ typedef enum { SHARP_YtW=0, /*!< analysis */
SHARP_ALM2MAP, /*!< synthesis */ SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */
SHARP_ALM2MAP_DERIV1 /*!< synthesis of first derivatives */ SHARP_Y=1, /*!< synthesis */
SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */
SHARP_Yt=2, /*!< adjoint synthesis */
SHARP_WY=3, /*!< adjoint analysis */
SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */
} sharp_jobtype; } sharp_jobtype;
/*! Job flags. The values are individual bits that can be combined with
bitwise OR and passed as the \a flags argument of sharp_execute(). */
typedef enum { SHARP_DP = 1<<4,
/*!< map and a_lm are in double precision */
SHARP_ADD = 1<<5,
/*!< results are added to the output arrays, instead of
overwriting them */
/* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
but its use here is deprecated in favor of having it in the sharp_alm_info */
SHARP_NO_FFT = 1<<7,
/* NOTE(review): SHARP_NO_FFT is not documented in this header; its exact
effect should be confirmed against the implementation. */
SHARP_USE_WEIGHTS = 1<<20, /* internal use only */
SHARP_NO_OPENMP = 1<<21, /* internal use only */
/* When SHARP_NO_OPENMP is set, the "if" clause of the OpenMP parallel region
in the job execution evaluates to false. */
SHARP_NVMAX = (1<<4)-1 /* internal use only */
/* SHARP_NVMAX equals 15, i.e. it covers exactly the four bits below SHARP_DP.
NOTE(review): presumably a mask for a small integer stored in the low bits
of the flags word -- confirm. */
} sharp_jobflags;
/*! Performs a libsharp SHT job. The interface deliberately does not use /*! Performs a libsharp SHT job. The interface deliberately does not use
the C99 "complex" data type, in order to be callable from C. the C99 "complex" data type, in order to be callable from C89 and C++.
\param type the type of SHT \param type the type of SHT
\param spin the spin of the quantities to be transformed \param spin the spin of the quantities to be transformed
\param add_output if 0, the output arrays will be overwritten,
else the result will be added to the output arrays.
\param alm contains pointers to the a_lm coefficients. If \a spin==0, \param alm contains pointers to the a_lm coefficients. If \a spin==0,
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT, etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
depends on the \a dp parameter. depends on whether the SHARP_DP flag is set.
\param map contains pointers to the maps. If \a spin==0, \param map contains pointers to the maps. If \a spin==0,
map[0] points to the map of the first SHT, map[1] to that of the second map[0] points to the map of the first SHT, map[1] to that of the second
etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT, etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
map[2] and map[3] to those of the second, etc. The exact data type of \a map point to the maps of the first SHT, map[2] and map[3] to those of the
depends on the \a dp parameter. second, etc. The exact data type of \a map depends on whether the SHARP_DP
flag is set.
\param geom_info A \c sharp_geom_info object compatible with the provided \param geom_info A \c sharp_geom_info object compatible with the provided
\a map arrays. \a map arrays.
\param alm_info A \c sharp_alm_info object compatible with the provided \param alm_info A \c sharp_alm_info object compatible with the provided
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
exactly once. exactly once.
\param ntrans the number of simultaneous SHTs \param ntrans the number of simultaneous SHTs
\param dp if 0, the \a alm is expected to have the type "complex float **" \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
and \a map is expected to have the type "float **"; otherwise the expected \a alm is expected to have the type "complex double **" and \a map is
types are "complex double **" and "double **", respectively. expected to have the type "double **"; otherwise, the expected
\param nv Internally used SHT parameter. Set to 0 unless you know what you are types are "complex float **" and "float **", respectively.
doing.
\param time If not NULL, the wall clock time required for this SHT \param time If not NULL, the wall clock time required for this SHT
(in seconds)will be written here. (in seconds) will be written here.
\param opcnt If not NULL, a conservative estimate of the total floating point \param opcnt If not NULL, a conservative estimate of the total floating point
operation count for this SHT will be written here. */ operation count for this SHT will be written here. */
void sharp_execute (sharp_jobtype type, int spin, int add_output, void *alm, void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
int ntrans, int dp, int nv, double *time, unsigned long long *opcnt); int flags, double *time, unsigned long long *opcnt);
void sharp_set_chunksize_min(int new_chunksize_min);
void sharp_set_nchunks_max(int new_nchunks_max);
/*! Error codes returned by sharp_execute_mpi_maybe().
    The enumerator list deliberately has no trailing comma: a trailing comma
    is only valid from C99 onwards and is rejected by C89 and C++03
    compilers, which this header is meant to support. */
typedef enum { SHARP_ERROR_NO_MPI = 1
                 /*!< libsharp not compiled with MPI support */
             } sharp_errors;
/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
is compiled with USE_MPI. This is primarily useful for wrapper code etc.
Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
is *(MPI_Comm*)pcomm.
Other parameters are the same as sharp_execute_mpi.
Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
(in which case nothing is done).
*/
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
unsigned long long *opcnt);
/*! \} */ /*! \} */

View File

@ -25,8 +25,8 @@
/*! \file sharp_mpi.c /*! \file sharp_mpi.c
* Functionality only needed for MPI-parallel transforms * Functionality only needed for MPI-parallel transforms
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#ifdef USE_MPI #ifdef USE_MPI
@ -185,116 +185,161 @@ static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ? ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
(ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull; (ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size); job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
job->s_m=2*job->ntrans*job->nmaps;
job->s_th = job->s_m * ((job->type==SHARP_MAP2ALM) ? nmfull : nm);
} }
static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo) static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo)
{ {
if (job->type != SHARP_MAP2ALM) if (job->type != SHARP_MAP2ALM)
{
sharp_communicate_alm2map (minfo,&job->phase); sharp_communicate_alm2map (minfo,&job->phase);
job->s_th=job->s_m*minfo->nmtotal;
}
} }
static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo) static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo)
{ {
if (job->type == SHARP_MAP2ALM) if (job->type == SHARP_MAP2ALM)
{
sharp_communicate_map2alm (minfo,&job->phase); sharp_communicate_map2alm (minfo,&job->phase);
job->s_th=job->s_m*minfo->nm[minfo->mytask];
}
} }
static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm) static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
{ {
double timer=wallTime();
int ntasks; int ntasks;
MPI_Comm_size(comm, &ntasks); MPI_Comm_size(comm, &ntasks);
if (ntasks==1) /* fall back to scalar implementation */ if (ntasks==1) /* fall back to scalar implementation */
{ sharp_execute_job (job); return; } { sharp_execute_job (job); return; }
int lmax = job->ainfo->lmax; MPI_Barrier(comm);
double timer=wallTime();
job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin); job->opcnt=0;
sharp_mpi_info minfo; sharp_mpi_info minfo;
sharp_make_mpi_info(comm, job, &minfo); sharp_make_mpi_info(comm, job, &minfo);
/* clear output arrays if requested */ if (minfo.npairtotal>minfo.ntasks*300)
init_output (job);
alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
minfo.npairtotal);
double *cth = RALLOC(double,minfo.npairtotal),
*sth = RALLOC(double,minfo.npairtotal);
idxhelper *stmp = RALLOC(idxhelper,minfo.npairtotal);
for (int i=0; i<minfo.npairtotal; ++i)
{ {
cth[i] = cos(minfo.theta[i]); int nsub=(minfo.npairtotal+minfo.ntasks*200-1)/(minfo.ntasks*200);
sth[i] = sin(minfo.theta[i]); for (int isub=0; isub<nsub; ++isub)
stmp[i].s=sth[i]; {
stmp[i].i=i; sharp_job ljob=*job;
// When creating a_lm, every sub-job produces a complete set of
// coefficients; they need to be added up.
if ((isub>0)&&(job->type==SHARP_MAP2ALM)) ljob.flags|=SHARP_ADD;
sharp_geom_info lginfo;
lginfo.pair=RALLOC(sharp_ringpair,(job->ginfo->npairs/nsub)+1);
lginfo.npairs=0;
lginfo.nphmax = job->ginfo->nphmax;
while (lginfo.npairs*nsub+isub<job->ginfo->npairs)
{
lginfo.pair[lginfo.npairs]=job->ginfo->pair[lginfo.npairs*nsub+isub];
++lginfo.npairs;
}
ljob.ginfo=&lginfo;
sharp_execute_job_mpi (&ljob,comm);
job->opcnt+=ljob.opcnt;
DEALLOC(lginfo.pair);
}
} }
qsort (stmp,minfo.npairtotal,sizeof(idxhelper),idx_compare); else
int *idx = RALLOC(int,minfo.npairtotal); {
for (int i=0; i<minfo.npairtotal; ++i) int lmax = job->ainfo->lmax;
idx[i]=stmp[i].i; job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);
DEALLOC(stmp);
/* map->phase where necessary */ /* clear output arrays if requested */
map2phase (job, minfo.mmax, 0, job->ginfo->npairs); init_output (job);
map2alm_comm (job, &minfo); alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
minfo.npairtotal);
#pragma omp parallel double *cth = RALLOC(double,minfo.npairtotal),
*sth = RALLOC(double,minfo.npairtotal);
int *mlim = RALLOC(int,minfo.npairtotal);
for (int i=0; i<minfo.npairtotal; ++i)
{
cth[i] = cos(minfo.theta[i]);
sth[i] = sin(minfo.theta[i]);
mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i]);
}
/* map->phase where necessary */
map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
map2alm_comm (job, &minfo);
#pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
{ {
sharp_job ljob = *job; sharp_job ljob = *job;
sharp_Ylmgen_C generator; sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin); sharp_Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
alloc_almtmp(&ljob,lmax); alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1) #pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi) for (int mi=0; mi<job->ainfo->nm; ++mi)
{ {
/* alm->alm_tmp where necessary */ /* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi); alm2almtmp (&ljob, lmax, mi);
/* inner conversion loop */ /* inner conversion loop */
inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal, inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
&generator, mi, idx); &generator, mi, mlim);
/* alm_tmp->alm where necessary */ /* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi); almtmp2alm (&ljob, lmax, mi);
} }
sharp_Ylmgen_destroy(&generator); sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob); dealloc_almtmp(&ljob);
#pragma omp critical #pragma omp critical
job->opcnt+=ljob.opcnt; job->opcnt+=ljob.opcnt;
} /* end of parallel region */ } /* end of parallel region */
alm2map_comm (job, &minfo); alm2map_comm (job, &minfo);
/* phase->map where necessary */ /* phase->map where necessary */
phase2map (job, minfo.mmax, 0, job->ginfo->npairs); phase2map (job, minfo.mmax, 0, job->ginfo->npairs);
DEALLOC(cth); DEALLOC(mlim);
DEALLOC(sth); DEALLOC(cth);
DEALLOC(idx); DEALLOC(sth);
DEALLOC(job->norm_l); DEALLOC(job->norm_l);
dealloc_phase (job); dealloc_phase (job);
}
sharp_destroy_mpi_info(&minfo); sharp_destroy_mpi_info(&minfo);
job->time=wallTime()-timer; job->time=wallTime()-timer;
} }
void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin, void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
int add_output, void *alm, void *map, const sharp_geom_info *geom_info, void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int dp, int nv, double *time, const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
unsigned long long *opcnt) unsigned long long *opcnt)
{ {
sharp_job job; sharp_job job;
sharp_build_job_common (&job, type, spin, add_output, alm, map, geom_info, sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
alm_info, ntrans, dp, nv); ntrans, flags);
sharp_execute_job_mpi (&job, comm); sharp_execute_job_mpi (&job, comm);
if (time!=NULL) *time = job.time; if (time!=NULL) *time = job.time;
if (opcnt!=NULL) *opcnt = job.opcnt; if (opcnt!=NULL) *opcnt = job.opcnt;
} }
/* This prototype is deliberately placed in the C file instead of the C header:
the symbol has to exist for the Fortran wrappers, but it should not be visible to C code. */
void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
unsigned long long *opcnt);
/* Fortran-facing entry point: translates the Fortran communicator handle
   into a C communicator with MPI_Comm_f2c() and forwards every other
   argument unchanged to sharp_execute_mpi(). */
void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
  unsigned long long *opcnt)
  {
  MPI_Comm c_comm = MPI_Comm_f2c(comm);
  sharp_execute_mpi(c_comm, type, spin, alm, map, geom_info, alm_info,
    ntrans, flags, time, opcnt);
  }
#endif #endif

View File

@ -26,14 +26,14 @@
* Interface for the spherical transform library with MPI support. * Interface for the spherical transform library with MPI support.
* *
* Copyright (C) 2011,2012 Max-Planck-Society * Copyright (C) 2011,2012 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#ifndef PLANCK_SHARP_MPI_H #ifndef PLANCK_SHARP_MPI_H
#define PLANCK_SHARP_MPI_H #define PLANCK_SHARP_MPI_H
#include <mpi.h> #include <mpi.h>
#include "sharp.h" #include "sharp_lowlevel.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -44,18 +44,17 @@ extern "C" {
\param comm the MPI communicator to be used for this SHT \param comm the MPI communicator to be used for this SHT
\param type the type of SHT \param type the type of SHT
\param spin the spin of the quantities to be transformed \param spin the spin of the quantities to be transformed
\param add_output if 0, the output arrays will be overwritten,
else the result will be added to the output arrays.
\param alm contains pointers to the a_lm coefficients. If \a spin==0, \param alm contains pointers to the a_lm coefficients. If \a spin==0,
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT, etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
depends on the \a dp parameter. depends on whether the SHARP_DP flag is set.
\param map contains pointers to the maps. If \a spin==0, \param map contains pointers to the maps. If \a spin==0,
map[0] points to the map of the first SHT, map[1] to that of the second map[0] points to the map of the first SHT, map[1] to that of the second
etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT, etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
map[2] and map[3] to those of the second, etc. The exact data type of \a map point to the maps of the first SHT, map[2] and map[3] to those of the
depends on the \a dp parameter. second, etc. The exact data type of \a map depends on whether the SHARP_DP
flag is set.
\param geom_info A \c sharp_geom_info object compatible with the provided \param geom_info A \c sharp_geom_info object compatible with the provided
\a map arrays. The total map geometry is the union of all \a geom_info \a map arrays. The total map geometry is the union of all \a geom_info
objects over the participating MPI tasks. objects over the participating MPI tasks.
@ -64,18 +63,17 @@ extern "C" {
exactly once in the union of all \a alm_info objects over the participating exactly once in the union of all \a alm_info objects over the participating
MPI tasks. MPI tasks.
\param ntrans the number of simultaneous SHTs \param ntrans the number of simultaneous SHTs
\param dp if 0, the \a alm is expected to have the type "complex float **" \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
and \a map is expected to have the type "float **"; otherwise the expected \a alm is expected to have the type "complex double **" and \a map is
types are "complex double **" and "double **", respectively. expected to have the type "double **"; otherwise, the expected
\param nv Internally used SHT parameter. Set to 0 unless you know what you are types are "complex float **" and "float **", respectively.
doing.
\param time If not NULL, the wall clock time required for this SHT \param time If not NULL, the wall clock time required for this SHT
(in seconds)will be written here. (in seconds) will be written here.
\param opcnt If not NULL, a conservative estimate of the total floating point \param opcnt If not NULL, a conservative estimate of the total floating point
operation count for this SHT will be written here. */ operation count for this SHT will be written here. */
void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin, void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
int add_output, void *alm, void *map, const sharp_geom_info *geom_info, void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int dp, int nv, double *time, const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
unsigned long long *opcnt); unsigned long long *opcnt);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -1,249 +0,0 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_test.c
Accuracy test for libsharp's map analysis.
This program first generates a_lm coefficients up to
a user-specified lmax (with mmax=lmax); where applicable, the
real and imaginary parts of the coefficients are uniform
random numbers of the interval [-1;1[.
Afterwards, the random a_lm are converted to a map.
This map is analyzed (optionally using an iterative scheme
with a user-supplied number of steps).
After every iteration, the code then outputs the RMS of the residual a_lm
(i.e. the difference between the current and original a_lm), divided by
the RMS of the original a_lm, as well as the maximum absolute change of any
real or imaginary part between the current and original a_lm.
This operation can be performed for several different pixelisations:
- a Gaussian with the minimal number of rings for exact analysis
and a user-defined ring resolution
- an ECP grid with the minimal number of rings for exact analysis
and a user-defined ring resolution
- a Healpix grid with a user-defined Nside parameter.
The user can specify the spin of the desired transform.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
#include "memusage.h"
typedef complex double dcmplx;
/* Returns a pseudo-random value taken from rand() and mapped linearly onto
   the half-open interval [min;max[. */
static double drand (double min, double max)
  {
  /* keep the original evaluation order: scale first, then normalise */
  double scaled = (max-min)*rand();
  return min + scaled/(RAND_MAX+1.0);
  }
/* Fills `alm` with random coefficients for every (l,m) described by `helper`.
   Entries with both l<spin and m<spin are set to zero; for m==0 the
   imaginary part is zero. All other parts are drawn from [-1;1[. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
  {
  for (int mi=0; mi<helper->nm; ++mi)
    {
    const int m=helper->mval[mi];
    for (int l=m; l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
        {
        alm[sharp_alm_index(helper,l,mi)] = 0.;
        continue;
        }
      /* real part is drawn before the imaginary part */
      double re = drand(-1,1);
      double im = (m==0) ? 0 : drand(-1,1);
      alm[sharp_alm_index(helper,l,mi)] = re+_Complex_I*im;
      }
    }
  }
/* For each of the `ncomp` components, prints the RMS of (alm-alm2) divided
   by the RMS of alm, together with the largest absolute difference found in
   any real or imaginary part. */
static void measure_errors (dcmplx **alm, dcmplx **alm2,
  ptrdiff_t nalms, int ncomp)
  {
  for (int comp=0; comp<ncomp; ++comp)
    {
    double sqdiff=0, sqref=0, maxdiff=0;
    for (ptrdiff_t k=0; k<nalms; ++k)
      {
      double dre=creal(alm[comp][k])-creal(alm2[comp][k]),
             dim=cimag(alm[comp][k])-cimag(alm2[comp][k]);
      sqdiff+=dre*dre+dim*dim;
      sqref+=creal(alm[comp][k])*creal(alm[comp][k])
            +cimag(alm[comp][k])*cimag(alm[comp][k]);
      if (fabs(dre)>maxdiff) maxdiff=fabs(dre);
      if (fabs(dim)>maxdiff) maxdiff=fabs(dim);
      }
    sqdiff=sqrt(sqdiff/nalms);
    sqref=sqrt(sqref/nalms);
    printf("component %i: rms %e, maxerr %e\n",comp, sqdiff/sqref, maxdiff);
    }
  }
/* Analyses `map` into `alm`, prints timing and the error against `alm_orig`,
   then runs `niter` refinement passes. Each pass synthesises a map from the
   current `alm`, forms the residual against `map`, and runs map2alm on the
   residual with add_output=1 so the result is accumulated onto `alm`. */
static void map2alm_iter (sharp_geom_info *tinfo, double **map,
  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
  ptrdiff_t npix, ptrdiff_t nalms, int spin, int ntrans, int niter)
  {
  const int ncomp = ntrans*((spin==0) ? 1 : 2);
  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);

  double elapsed;
  unsigned long long nops;

  /* initial analysis */
  sharp_execute(SHARP_MAP2ALM,spin,0,&alm[0],&map[0],tinfo,ainfo,ntrans,1,0,
    &elapsed,&nops);
  printf("wall time for map2alm: %fs\n",elapsed);
  printf("Performance: %fGFLOPs/s\n",1e-9*nops/elapsed);
  measure_errors(alm_orig,alm,nalms,ncomp);

  for (int pass=1; pass<=niter; ++pass)
    {
    double **resid;
    ALLOC2D(resid,double,ncomp,npix);
    printf ("\niteration %i:\n", pass);

    /* synthesise a map from the current estimate */
    sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&resid[0],tinfo,ainfo,ntrans,
      1,0,&elapsed,&nops);
    printf("wall time for alm2map: %fs\n",elapsed);
    printf("Performance: %fGFLOPs/s\n",1e-9*nops/elapsed);

    /* residual = input map - synthesised map */
    for (int c=0; c<ncomp; ++c)
      for (ptrdiff_t p=0; p<npix; ++p)
        resid[c][p] = map[c][p]-resid[c][p];

    /* analyse the residual and add it to the current estimate */
    sharp_execute(SHARP_MAP2ALM,spin,1,&alm[0],&resid[0],tinfo,ainfo,ntrans,
      1,0,&elapsed,&nops);
    printf("wall time for map2alm: %fs\n",elapsed);
    printf("Performance: %fGFLOPs/s\n",1e-9*nops/elapsed);

    DEALLOC2D(resid);
    measure_errors(alm_orig,alm,nalms,ncomp);
    }

  sharp_destroy_alm_info(ainfo);
  }
/* Generates random a_lm (fixed seed 4), converts them to a map with
   SHARP_ALM2MAP, reports the timing, and hands the map to map2alm_iter()
   for analysis and error reporting. */
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
  {
  /* number of coefficients of a triangular a_lm set limited to mmax */
  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  const int ncomp = ntrans*((spin==0) ? 1 : 2);

  double **map;
  ALLOC2D(map,double,ncomp,npix);

  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);

  srand(4);
  dcmplx **alm_in;
  ALLOC2D(alm_in,dcmplx,ncomp,nalms);
  for (int c=0; c<ncomp; ++c)
    random_alm(alm_in[c],ainfo,spin);

  dcmplx **alm_out;
  ALLOC2D(alm_out,dcmplx,ncomp,nalms);

  double elapsed;
  unsigned long long nops;
  printf ("\niteration 0:\n");
  sharp_execute(SHARP_ALM2MAP,spin,0,&alm_in[0],&map[0],tinfo,ainfo,ntrans,
    1,0,&elapsed,&nops);
  printf("wall time for alm2map: %fs\n",elapsed);
  printf("Performance: %fGFLOPs/s\n",1e-9*nops/elapsed);

  map2alm_iter(tinfo,map,alm_in,alm_out,lmax,mmax,npix,nalms,spin,ntrans,
    niter);

  DEALLOC2D(map);
  DEALLOC2D(alm_in);
  DEALLOC2D(alm_out);
  sharp_destroy_alm_info(ainfo);
  }
/* Command line driver:
     <healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>
   Builds the requested pixelisation, runs check_accuracy() on it and prints
   the memory high water mark. */
int main(int argc, char **argv)
  {
#ifdef USE_MPI
  MPI_Init(NULL,NULL);
#endif
  sharp_module_startup("sharp_test",argc,7,
    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",1);

  const char *grid=argv[1];
  int lmax=atoi(argv[2]);
  int niter=atoi(argv[4]);
  int spin=atoi(argv[5]);
  int ntrans=atoi(argv[6]);

  printf("Testing map analysis accuracy.\n");
  printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);

  sharp_geom_info *geom;
  if (strcmp(grid,"gauss")==0)
    {
    int nrings=lmax+1;
    int nphi=atoi(argv[3]);
    ptrdiff_t npix=(ptrdiff_t)nrings*nphi;
    printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
      nrings,nphi,(long)npix);
    sharp_make_gauss_geom_info (nrings, nphi, 1, nphi, &geom);
    check_accuracy(geom,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(geom);
    }
  else if (strcmp(grid,"ecp")==0)
    {
    int nrings=2*lmax+2;
    int nphi=atoi(argv[3]);
    ptrdiff_t npix=(ptrdiff_t)nrings*nphi;
    printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
      nrings,nphi,(long)npix);
    sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &geom);
    check_accuracy(geom,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(geom);
    }
  else if (strcmp(grid,"healpix")==0)
    {
    int nside=atoi(argv[3]);
    if (nside<1) nside=1;   /* clamp invalid Nside values to 1 */
    ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
    printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
      nside,(long)npix);
    sharp_make_healpix_geom_info (nside, 1, &geom);
    check_accuracy(geom,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(geom);
    }
  else
    UTIL_FAIL("unknown grid geometry");

  printf("\nMemory high water mark: %.2f MB\n",VmHWM()/(1<<20));
#ifdef USE_MPI
  MPI_Finalize();
#endif
  return 0;
  }

View File

@ -1,359 +0,0 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_test_mpi.c
Accuracy test for libsharp's map analysis with MPI support.
This program first generates a_lm coefficients up to
a user-specified lmax (with mmax=lmax); where applicable, the
real and imaginary parts of the coefficients are uniform
random numbers of the interval [-1;1[.
Afterwards, the random a_lm are converted to a map.
This map is analyzed (optionally using an iterative scheme
with a user-supplied number of steps).
After every iteration, the code then outputs the RMS of the residual a_lm
(i.e. the difference between the current and original a_lm), divided by
the RMS of the original a_lm, as well as the maximum absolute change of any
real or imaginary part between the current and original a_lm.
This operation can be performed for several different pixelisations:
- a Gaussian with the minimal number of rings for exact analysis
and a user-defined ring resolution
- an ECP grid with the minimal number of rings for exact analysis
and a user-defined ring resolution
- a Healpix grid with a user-defined Nside parameter.
The user can specify the spin of the desired transform.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#ifdef USE_MPI
#include <stdio.h>
#include <string.h>
#include "sharp_mpi.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "walltime_c.h"
#include "sharp_announce.h"
#include "sharp_core.h"
typedef complex double dcmplx;
int ntasks, mytask;
/* Returns the sum of `val` over all tasks of MPI_COMM_WORLD. */
static unsigned long long totalops (unsigned long long val)
  {
  unsigned long long total;
  MPI_Allreduce (&val, &total, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM,
    MPI_COMM_WORLD);
  return total;
  }
/* Returns the maximum of `val` over all tasks of MPI_COMM_WORLD. */
static double maxTime (double val)
  {
  double slowest;
  MPI_Allreduce (&val, &slowest, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  return slowest;
  }
/* Returns a pseudo-random value taken from rand() and mapped linearly onto
   the half-open interval [min;max[. */
static double drand (double min, double max)
  {
  /* keep the original evaluation order: scale first, then normalise */
  double scaled = (max-min)*rand();
  return min + scaled/(RAND_MAX+1.0);
  }
/* Counts the a_lm coefficients described by `ainfo`: every stored m
   contributes lmax-m+1 entries. */
static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
  {
  ptrdiff_t total=0;
  int mi=0;
  while (mi<ainfo->nm)
    {
    total += ainfo->lmax-ainfo->mval[mi]+1;
    ++mi;
    }
  return total;
  }
/* Counts the pixels described by `ginfo`: the first ring of each pair always
   counts, the second ring only when its pixel count is positive. */
static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
  {
  ptrdiff_t total=0;
  for (int ip=0; ip<ginfo->npairs; ++ip)
    {
    total += ginfo->pair[ip].r1.nph;
    if (ginfo->pair[ip].r2.nph<=0) continue;
    total += ginfo->pair[ip].r2.nph;
    }
  return total;
  }
/* Restricts `ainfo` in place to this task's share of the m values: entries
   mytask, mytask+ntasks, ... are kept, packed to the front, and their
   start offsets are recomputed for a contiguous local layout. */
static void reduce_alm_info(sharp_alm_info *ainfo)
  {
  int kept=0;
  ptrdiff_t ofs=0;
  int src=mytask;
  while (src<ainfo->nm)
    {
    ainfo->mval[kept]=ainfo->mval[src];
    /* mvstart is stored relative to l=0, hence the subtraction of m */
    ainfo->mvstart[kept]=ofs-ainfo->mval[kept];
    ofs+=ainfo->lmax-ainfo->mval[kept]+1;
    src+=ntasks;
    ++kept;
    }
  ainfo->nm=kept;
  }
/* Restricts `ginfo` in place to this task's share of the ring pairs: pairs
   mytask, mytask+ntasks, ... are kept, packed to the front, and their ring
   offsets are recomputed for a contiguous local pixel layout. */
static void reduce_geom_info(sharp_geom_info *ginfo)
  {
  int kept=0;
  ptrdiff_t ofs=0;
  int src=mytask;
  while (src<ginfo->npairs)
    {
    ginfo->pair[kept]=ginfo->pair[src];
    ginfo->pair[kept].r1.ofs=ofs;
    ofs+=ginfo->pair[kept].r1.nph;
    /* r2.ofs is always written; it only advances when the ring exists */
    ginfo->pair[kept].r2.ofs=ofs;
    if (ginfo->pair[kept].r2.nph>0) ofs+=ginfo->pair[kept].r2.nph;
    src+=ntasks;
    ++kept;
    }
  ginfo->npairs=kept;
  }
/* Fills `alm` with random coefficients for every (l,m) described by `helper`.
   rand() is reseeded for each m from a call counter and m itself. Entries
   with both l<spin and m<spin are zero; for m==0 the imaginary part is zero. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
  {
  static int ncalls=0;   /* number of invocations so far, part of the seed */
  ++ncalls;
  for (int mi=0; mi<helper->nm; ++mi)
    {
    const int m=helper->mval[mi];
    srand(1234567*ncalls+8912*m);
    for (int l=m; l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
        {
        alm[sharp_alm_index(helper,l,mi)] = 0.;
        continue;
        }
      /* real part is drawn before the imaginary part */
      double re = drand(-1,1);
      double im = (m==0) ? 0 : drand(-1,1);
      alm[sharp_alm_index(helper,l,mi)] = re+_Complex_I*im;
      }
    }
  }
/* For each component, combines the local squared differences and squared
   reference values over MPI_COMM_WORLD and lets task 0 print the relative
   RMS error and the global maximum absolute difference. */
static void measure_errors (dcmplx **alm, dcmplx **alm2,
  const sharp_alm_info *ainfo, int ncomp)
  {
  long nalms=get_nalms(ainfo), nalms_tot;
  MPI_Allreduce(&nalms,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);

  for (int comp=0; comp<ncomp; ++comp)
    {
    double sqdiff=0, sqref=0, maxdiff=0;
    double sqdiff_tot, sqref_tot, maxdiff_tot;
    for (int mi=0; mi<ainfo->nm; ++mi)
      {
      const int m=ainfo->mval[mi];
      for (int l=m; l<=ainfo->lmax; ++l)
        {
        ptrdiff_t k=sharp_alm_index(ainfo,l,mi);
        double dre=creal(alm[comp][k])-creal(alm2[comp][k]),
               dim=cimag(alm[comp][k])-cimag(alm2[comp][k]);
        sqdiff+=dre*dre+dim*dim;
        sqref+=creal(alm[comp][k])*creal(alm[comp][k])
              +cimag(alm[comp][k])*cimag(alm[comp][k]);
        if (fabs(dre)>maxdiff) maxdiff=fabs(dre);
        if (fabs(dim)>maxdiff) maxdiff=fabs(dim);
        }
      }
    MPI_Allreduce(&sqdiff,&sqdiff_tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    MPI_Allreduce(&sqref,&sqref_tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    MPI_Allreduce(&maxdiff,&maxdiff_tot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
    sqdiff_tot=sqrt(sqdiff_tot/nalms_tot);
    sqref_tot=sqrt(sqref_tot/nalms_tot);
    if (mytask==0)
      printf("component %i: rms %e, maxerr %e\n",comp, sqdiff_tot/sqref_tot,
        maxdiff_tot);
    }
  }
/* Analyses `map` into `alm` across all MPI tasks, prints timing and the error
   against `alm_orig` on task 0, then runs `niter` refinement passes. Each
   pass synthesises a map from the current `alm`, forms the residual against
   `map`, and runs map2alm on the residual with add_output=1 so the result is
   accumulated onto `alm`.
   Reported times are the maximum job time over all tasks; operation counts
   are summed over all tasks. */
static void map2alm_iter (sharp_geom_info *tinfo, double **map,
  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
  ptrdiff_t npix, int spin, int ntrans, int niter)
  {
  int ncomp = ntrans*((spin==0) ? 1 : 2);
  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
  reduce_alm_info(alms);

  double jtime;
  unsigned long long jopcnt;

  /* initial analysis */
  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,0,&alm[0],&map[0],
    tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
  unsigned long long opcnt=totalops(jopcnt);
  double timer=maxTime(jtime);
  if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
  measure_errors(alm_orig,alm,alms,ncomp);

  for (int iter=0; iter<niter; ++iter)
    {
    double **map2;
    ALLOC2D(map2,double,ncomp,npix);
    if (mytask==0) printf ("\niteration %i:\n", iter+1);

    /* synthesise a map from the current estimate */
    sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,0,&alm[0],&map2[0],
      tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
    opcnt=totalops(jopcnt);
    timer=maxTime(jtime);
    if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);

    /* residual = input map - synthesised map */
    for (int i=0; i<ncomp; ++i)
      for (ptrdiff_t m=0; m<npix; ++m)
        map2[i][m] = map[i][m]-map2[i][m];

    /* analyse the residual and add it to the current estimate */
    sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,1,&alm[0],&map2[0],
      tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
    opcnt=totalops(jopcnt);
    timer=maxTime(jtime);
    /* `timer` already is a duration; it must not be subtracted from the
       current wall clock time */
    if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);

    DEALLOC2D(map2);
    measure_errors(alm_orig,alm,alms,ncomp);
    }

  sharp_destroy_alm_info(alms);
  }
/* Generates this task's share of random a_lm, converts them to a map with
   SHARP_ALM2MAP over MPI_COMM_WORLD, lets task 0 report the timing, and
   hands the map to map2alm_iter() for analysis and error reporting. */
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
  {
  const int ncomp = ntrans*((spin==0) ? 1 : 2);

  double **map;
  ALLOC2D(map,double,ncomp,npix);

  double job_time;
  unsigned long long job_ops;

  /* local a_lm layout: triangular set reduced to this task's m values */
  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);
  reduce_alm_info(ainfo);
  ptrdiff_t nalms=get_nalms(ainfo);

  dcmplx **alm_in;
  ALLOC2D(alm_in,dcmplx,ncomp,nalms);
  srand(4);
  for (int c=0; c<ncomp; ++c)
    random_alm(alm_in[c],ainfo,spin);

  dcmplx **alm_out;
  ALLOC2D(alm_out,dcmplx,ncomp,nalms);

  if (mytask==0) printf ("\niteration 0:\n");
  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,0,&alm_in[0],&map[0],
    tinfo,ainfo,ntrans,1,0,&job_time,&job_ops);
  unsigned long long all_ops=totalops(job_ops);
  double max_time=maxTime(job_time);
  if (mytask==0) printf("wall time for alm2map: %fs\n",max_time);
  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*all_ops/max_time);

  map2alm_iter(tinfo, map, alm_in, alm_out, lmax, mmax, npix, spin, ntrans,
    niter);

  DEALLOC2D(map);
  DEALLOC2D(alm_in);
  DEALLOC2D(alm_out);
  sharp_destroy_alm_info(ainfo);
  }
/* MPI command line driver:
     <healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>
   Builds the requested pixelisation, reduces it to this task's share of the
   ring pairs and runs check_accuracy() on it. Informational output is
   printed by task 0 only. */
int main(int argc, char **argv)
  {
  MPI_Init(NULL,NULL);
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);

  sharp_module_startup("sharp_test_mpi",argc,7,
    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",
    mytask==0);

  const char *grid=argv[1];
  int lmax=atoi(argv[2]);
  int niter=atoi(argv[4]);
  int spin=atoi(argv[5]);
  int ntrans=atoi(argv[6]);

  if (mytask==0)
    {
    printf("Testing map analysis accuracy.\n");
    printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
    }

  sharp_geom_info *geom;
  if (strcmp(grid,"gauss")==0)
    {
    int nrings=lmax+1;
    int nphi=atoi(argv[3]);
    ptrdiff_t npix=(ptrdiff_t)nrings*nphi;
    if (mytask==0)
      printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
        nrings,nphi,(long)npix);
    sharp_make_gauss_geom_info (nrings, nphi, 1, nphi, &geom);
    reduce_geom_info(geom);
    npix=get_npix(geom);   /* from here on: local pixel count */
    check_accuracy(geom,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(geom);
    }
  else if (strcmp(grid,"ecp")==0)
    {
    int nrings=2*lmax+2;
    int nphi=atoi(argv[3]);
    ptrdiff_t npix=(ptrdiff_t)nrings*nphi;
    if (mytask==0)
      printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
        nrings,nphi,(long)npix);
    sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &geom);
    reduce_geom_info(geom);
    npix=get_npix(geom);   /* from here on: local pixel count */
    check_accuracy(geom,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(geom);
    }
  else if (strcmp(grid,"healpix")==0)
    {
    int nside=atoi(argv[3]);
    if (nside<1) nside=1;   /* clamp invalid Nside values to 1 */
    ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
    if (mytask==0)
      printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
        nside,(long)npix);
    sharp_make_healpix_geom_info (nside, 1, &geom);
    reduce_geom_info(geom);
    npix=get_npix(geom);   /* from here on: local pixel count */
    check_accuracy(geom,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(geom);
    }
  else
    UTIL_FAIL("unknown grid geometry");

  MPI_Finalize();
  return 0;
  }
#else
#include "c_utils.h"
/* Stub entry point used when the program is built without USE_MPI: it only
   reports that MPI support is missing. */
int main(void)
  {
  UTIL_FAIL("MPI support not enabled.");
  return 1;
  }
#endif

View File

@ -0,0 +1,708 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/* \file sharp_testsuite.c
*
* Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#include "sharp_mpi.h"
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#include "sharp.h"
#include "sharp_internal.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "memusage.h"
#include "sharp_vecsupport.h"
typedef complex double dcmplx;
int ntasks, mytask;
/* Advances the linear congruential generator stored in `*state` by one step
   and returns a value mapped linearly onto the half-open interval [min;max[.
   \param min lower bound (inclusive)
   \param max upper bound (exclusive)
   \param state generator state; after the call it holds the new 31-bit
     value */
static double drand (double min, double max, int *state)
  {
  /* Do the multiply-add in unsigned arithmetic: the product regularly
     exceeds INT_MAX, and signed overflow is undefined behaviour, whereas
     unsigned arithmetic wraps modulo 2^N. */
  unsigned int next = ((unsigned int)(*state))*1103515245u + 12345u;
  *state = (int)(next & 0x7fffffffu);
  return min + (max-min)*(*state)/(0x7fffffff+1.0);
  }
/* Fills `alm` with random coefficients for every (l,m) described by `helper`.
   Each m uses its own generator state seeded from `cnt` and m, and the loop
   over m is distributed with OpenMP. Entries with both l<spin and m<spin
   are zero; for m==0 the imaginary part is zero. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
  {
#pragma omp parallel
{
  int mi;
#pragma omp for schedule (dynamic,100)
  for (mi=0;mi<helper->nm; ++mi)
    {
    const int m=helper->mval[mi];
    int seed=1234567*cnt+8912*m; /* per-m generator state */
    for (int l=m; l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
        {
        alm[sharp_alm_index(helper,l,mi)] = 0.;
        continue;
        }
      /* real part is drawn before the imaginary part */
      double re = drand(-1,1,&seed);
      double im = (m==0) ? 0 : drand(-1,1,&seed);
      alm[sharp_alm_index(helper,l,mi)] = re+_Complex_I*im;
      }
    }
} /* end of parallel region */
  }
/* Returns the sum of `val` over all tasks of MPI_COMM_WORLD when built with
   USE_MPI, and `val` itself otherwise. */
static unsigned long long totalops (unsigned long long val)
  {
#ifndef USE_MPI
  return val;
#else
  unsigned long long total;
  MPI_Allreduce (&val, &total, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM,
    MPI_COMM_WORLD);
  return total;
#endif
  }
/* Returns the maximum of `val` over all tasks of MPI_COMM_WORLD when built
   with USE_MPI, and `val` itself otherwise. */
static double maxTime (double val)
  {
#ifndef USE_MPI
  return val;
#else
  double slowest;
  MPI_Allreduce (&val, &slowest, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  return slowest;
#endif
  }
/* Returns the sum of `val` over all tasks of MPI_COMM_WORLD when built with
   USE_MPI, and `val` itself otherwise. */
static double allreduceSumDouble (double val)
  {
#ifndef USE_MPI
  return val;
#else
  double total;
  MPI_Allreduce (&val, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  return total;
#endif
  }
/* Returns the value of VmHWM(), summed over all tasks of MPI_COMM_WORLD when
   built with USE_MPI. */
static double totalMem()
  {
#ifndef USE_MPI
  return VmHWM();
#else
  double local=VmHWM(), total;
  MPI_Allreduce (&local, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  return total;
#endif
  }
#ifdef USE_MPI
/* Restricts `ainfo` in place to this task's share of the m values: entries
   mytask, mytask+ntasks, ... are kept, packed to the front, and their
   start offsets are recomputed for a contiguous local layout. */
static void reduce_alm_info(sharp_alm_info *ainfo)
  {
  int kept=0;
  ptrdiff_t ofs=0;
  int src=mytask;
  while (src<ainfo->nm)
    {
    ainfo->mval[kept]=ainfo->mval[src];
    /* mvstart is stored relative to l=0, hence the subtraction of m */
    ainfo->mvstart[kept]=ofs-ainfo->mval[kept];
    ofs+=ainfo->lmax-ainfo->mval[kept]+1;
    src+=ntasks;
    ++kept;
    }
  ainfo->nm=kept;
  }
/* Restricts `ginfo` in place to this task's share of the ring pairs: pairs
   mytask, mytask+ntasks, ... are kept, packed to the front, and their ring
   offsets are recomputed for a contiguous local pixel layout. */
static void reduce_geom_info(sharp_geom_info *ginfo)
  {
  int kept=0;
  ptrdiff_t ofs=0;
  int src=mytask;
  while (src<ginfo->npairs)
    {
    ginfo->pair[kept]=ginfo->pair[src];
    ginfo->pair[kept].r1.ofs=ofs;
    ofs+=ginfo->pair[kept].r1.nph;
    /* r2.ofs is always written; it only advances when the ring exists */
    ginfo->pair[kept].r2.ofs=ofs;
    if (ginfo->pair[kept].r2.nph>0) ofs+=ginfo->pair[kept].r2.nph;
    src+=ntasks;
    ++kept;
    }
  ginfo->npairs=kept;
  }
#endif
/* Counts the a_lm coefficients described by `ainfo`: every stored m
   contributes lmax-m+1 entries. */
static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
  {
  ptrdiff_t total=0;
  int mi=0;
  while (mi<ainfo->nm)
    {
    total += ainfo->lmax-ainfo->mval[mi]+1;
    ++mi;
    }
  return total;
  }
/* Counts the pixels described by `ginfo`: the first ring of each pair always
   counts, the second ring only when its pixel count is positive. */
static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
  {
  ptrdiff_t total=0;
  for (int ip=0; ip<ginfo->npairs; ++ip)
    {
    total += ginfo->pair[ip].r1.nph;
    if (ginfo->pair[ip].r2.nph<=0) continue;
    total += ginfo->pair[ip].r2.nph;
    }
  return total;
  }
/* Returns a newly allocated array holding, per component, the sum of
   |alm|^2 over the first `nalms` entries. As a side effect every one of
   those entries of `alm` is negated in place. */
static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
  {
  double *sqsum=RALLOC(double,ncomp);
  for (int comp=0; comp<ncomp; ++comp)
    {
    dcmplx *row=alm[comp];
    sqsum[comp]=0;
    for (ptrdiff_t k=0; k<nalms; ++k)
      {
      sqsum[comp]+=creal(row[k])*creal(row[k])
                  +cimag(row[k])*cimag(row[k]);
      row[k]=-row[k];
      }
    }
  return sqsum;
  }
/* Computes per-component error figures from `alm` and the reference square
   sums in `sqsum`, combined over all MPI tasks when built with USE_MPI.
   \param alm per-component coefficient arrays whose magnitudes are measured
   \param nalms number of local entries per component
   \param ncomp number of components
   \param sqsum per-component local sum of squares used as normalisation
   \param err_abs receives a newly allocated array with the largest |alm|
     entry per component
   \param err_rel receives a newly allocated array with
     rms(alm)/sqrt(sqsum/nalms_total) per component */
static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
  double **err_abs, double **err_rel)
  {
  /* Reduce through a genuine `long`: MPI_LONG must describe the exact C type
     of the buffers, and ptrdiff_t is not `long` on every platform. */
  long nalms_loc=(long)nalms;
  long nalms_tot=nalms_loc;
#ifdef USE_MPI
  MPI_Allreduce(&nalms_loc,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);
#endif

  *err_abs=RALLOC(double,ncomp);
  *err_rel=RALLOC(double,ncomp);
  for (int i=0; i<ncomp; ++i)
    {
    double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
    for (ptrdiff_t j=0; j<nalms; ++j)
      {
      double sqr=creal(alm[i][j])*creal(alm[i][j])
                +cimag(alm[i][j])*cimag(alm[i][j]);
      sum+=sqr;
      if (sqr>maxdiff) maxdiff=sqr;
      }
    /* the maximum was tracked on squared magnitudes */
    maxdiff=sqrt(maxdiff);

#ifdef USE_MPI
    MPI_Allreduce(&sum,&sumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    MPI_Allreduce(&sqsum[i],&sqsumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    MPI_Allreduce(&maxdiff,&maxdifftot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
#else
    sumtot=sum;
    sqsumtot=sqsum[i];
    maxdifftot=maxdiff;
#endif

    sumtot=sqrt(sumtot/nalms_tot);
    sqsumtot=sqrt(sqsumtot/nalms_tot);
    (*err_abs)[i]=maxdifftot;
    (*err_rel)[i]=sumtot/sqsumtot;
    }
  }
/* Returns the smallest integer >= n whose only prime factors are 2, 3 and 5.
   Values of n up to 6 are returned unchanged. */
static int good_fft_size(int n)
  {
  if (n<=6) return n;

  int best=2*n;   /* upper bound; tightened whenever a candidate fits */
  int p2=1;
  while (p2<best)
    {
    int p23=p2;
    while (p23<best)
      {
      int p235=p23;
      while (p235<best)
        {
        if (p235>=n) best=p235;
        p235*=5;
        }
      p23*=3;
      }
    p2*=2;
    }
  return best;
  }
/* Creates the a_lm description and the pixelisation selected by `gname`.
   \param gname grid name: "healpix", "gauss", "fejer1", "fejer2", "cc" or
     "smallgauss"; any other value triggers UTIL_FAIL
   \param lmax maximum l; must not be negative
   \param mmax in/out: a negative value is replaced by lmax
   \param gpar1 in/out: first grid parameter (nside for "healpix", nlat
     otherwise); values <1 are replaced by a grid-specific default
   \param gpar2 in/out: second grid parameter (nlon; not used for
     "healpix"); values <1 are replaced by 2*mmax+1
   \param ginfo receives the newly created geometry object
   \param ainfo receives the newly created a_lm object
   When built with USE_MPI, both objects are reduced to this task's share.
   Informational output is printed by task 0 only. */
static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
int *gpar2, sharp_geom_info **ginfo, sharp_alm_info **ainfo)
{
UTIL_ASSERT(lmax>=0,"lmax must not be negative");
/* a negative mmax selects the default mmax=lmax */
if (*mmax<0) *mmax=lmax;
UTIL_ASSERT(*mmax<=lmax,"mmax larger than lmax");
if (mytask==0) printf ("lmax: %d, mmax: %d\n",lmax,*mmax);
sharp_make_triangular_alm_info(lmax,*mmax,1,ainfo);
#ifdef USE_MPI
reduce_alm_info(*ainfo);
#endif
if (strcmp(gname,"healpix")==0)
{
/* default nside is lmax/2, but never less than 1 */
if (*gpar1<1) *gpar1=lmax/2;
if (*gpar1==0) ++(*gpar1);
sharp_make_healpix_geom_info (*gpar1, 1, ginfo);
if (mytask==0) printf ("HEALPix grid, nside=%d\n",*gpar1);
}
else if (strcmp(gname,"gauss")==0)
{
/* defaults: nlat=lmax+1, nlon=2*mmax+1 */
if (*gpar1<1) *gpar1=lmax+1;
if (*gpar2<1) *gpar2=2*(*mmax)+1;
sharp_make_gauss_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
if (mytask==0)
printf ("Gauss-Legendre grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
}
else if (strcmp(gname,"fejer1")==0)
{
/* defaults: nlat=2*lmax+1, nlon=2*mmax+1 */
if (*gpar1<1) *gpar1=2*lmax+1;
if (*gpar2<1) *gpar2=2*(*mmax)+1;
sharp_make_fejer1_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
if (mytask==0) printf ("Fejer1 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
}
else if (strcmp(gname,"fejer2")==0)
{
/* defaults: nlat=2*lmax+1, nlon=2*mmax+1 */
if (*gpar1<1) *gpar1=2*lmax+1;
if (*gpar2<1) *gpar2=2*(*mmax)+1;
sharp_make_fejer2_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
if (mytask==0) printf ("Fejer2 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
}
else if (strcmp(gname,"cc")==0)
{
/* defaults: nlat=2*lmax+1, nlon=2*mmax+1 */
if (*gpar1<1) *gpar1=2*lmax+1;
if (*gpar2<1) *gpar2=2*(*mmax)+1;
sharp_make_cc_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
if (mytask==0)
printf("Clenshaw-Curtis grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
}
else if (strcmp(gname,"smallgauss")==0)
{
/* start from a full Gauss grid (same defaults as "gauss") and then shrink
   the number of pixels of each ring individually */
int nlat=*gpar1, nlon=*gpar2;
if (nlat<1) nlat=lmax+1;
if (nlon<1) nlon=2*(*mmax)+1;
*gpar1=nlat; *gpar2=nlon;
sharp_make_gauss_geom_info (nlat, nlon, 0., 1, nlon, ginfo);
/* pixel count of the unreduced grid, used for the savings report below */
ptrdiff_t npix_o=get_npix(*ginfo);
size_t ofs=0;
for (int i=0; i<(*ginfo)->npairs; ++i)
{
sharp_ringpair *pair=&((*ginfo)->pair[i]);
/* ring length derived from sharp_get_mlim() for this ring, capped at nlon
   and then passed through good_fft_size() */
int pring=1+2*sharp_get_mlim(lmax,0,pair->r1.sth,pair->r1.cth);
if (pring>nlon) pring=nlon;
pring=good_fft_size(pring);
pair->r1.nph=pring;
/* rescale the ring weight by the ratio of old to new pixel count */
pair->r1.weight*=nlon*1./pring;
pair->r1.ofs=ofs;
ofs+=pring;
/* the second ring of the pair, if present, gets the same treatment */
if (pair->r2.nph>0)
{
pair->r2.nph=pring;
pair->r2.weight*=nlon*1./pring;
pair->r2.ofs=ofs;
ofs+=pring;
}
}
if (mytask==0)
{
ptrdiff_t npix=get_npix(*ginfo);
printf("Small Gauss grid, nlat=%d, npix=%ld, savings=%.2f%%\n",
nlat,(long)npix,(npix_o-npix)*100./npix_o);
}
}
else
UTIL_FAIL("unknown grid geometry");
#ifdef USE_MPI
reduce_geom_info(*ginfo);
#endif
}
/* Regression check for the sign and scale conventions of the synthesis
 * routines.
 *
 * Builds a Gauss grid for lmax=mmax=50 (lmax+1 rings, 2*lmax+2 pixels per
 * ring), mirrors its colatitudes, sets every a_lm to 1+i and runs
 * SHARP_ALM2MAP for spins 0, 1 and 2 plus SHARP_ALM2MAP_DERIV1, each for
 * ntrans=1..9.  After every transform the first pixel, pixel npix/2 and the
 * last pixel of each checked map are compared against hard-coded reference
 * values via UTIL_ASSERT/FAPPROX; any mismatch aborts through UTIL_ASSERT. */
static void check_sign_scale(void)
  {
  int lmax=50;
  int mmax=lmax;
  sharp_geom_info *tinfo;
  int nrings=lmax+1;
  int ppring=2*lmax+2;
  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
  /* flip theta to emulate the "old" Gaussian grid geometry */
  for (int i=0; i<tinfo->npairs; ++i)
    {
    const double pi=3.141592653589793238462643383279502884197;
    tinfo->pair[i].r1.cth=-tinfo->pair[i].r1.cth;
    tinfo->pair[i].r2.cth=-tinfo->pair[i].r2.cth;
    tinfo->pair[i].r1.theta=pi-tinfo->pair[i].r1.theta;
    tinfo->pair[i].r2.theta=pi-tinfo->pair[i].r2.theta;
    }
  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
  /* number of (l,m) pairs with 0<=m<=mmax and m<=l<=lmax */
  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  for (int ntrans=1; ntrans<10; ++ntrans)
    {
    /* 2*ntrans rows are allocated so the same buffers serve both the
       spin-0 case (ntrans maps) and the spin>0 cases (2*ntrans maps) */
    double **map;
    ALLOC2D(map,double,2*ntrans,npix);
    dcmplx **alm;
    ALLOC2D(alm,dcmplx,2*ntrans,nalms);
    for (int i=0; i<2*ntrans; ++i)
      for (int j=0; j<nalms; ++j)
        alm[i][j]=1.+_Complex_I;
    /* spin 0: one map per transform */
    sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
    NULL,NULL);
    for (int it=0; it<ntrans; ++it)
      {
      UTIL_ASSERT(FAPPROX(map[it][0 ], 3.588246976618616912e+00,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
      "error");
      }
    /* spin 1: two maps (2*it and 2*it+1) per transform */
    sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
    NULL,NULL);
    for (int it=0; it<ntrans; ++it)
      {
      UTIL_ASSERT(FAPPROX(map[2*it ][0 ], 2.750897760535633285e+00,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it ][npix/2], 3.137704477368562905e+01,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-8.405730859837063917e+01,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-2.398026536095463346e+00,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
      "error");
      }
    /* spin 2: two maps per transform */
    sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
    NULL,NULL);
    for (int it=0; it<ntrans; ++it)
      {
      UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-1.398186224727334448e+00,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.456676000884031197e+01,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.516249174408820863e+02,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-3.173406200299964119e+00,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
      "error");
      }
    /* first-derivative synthesis, invoked with spin 1; two maps per
       transform.  Note the looser 1e-11 tolerance on the first value. */
    sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,ntrans,
    SHARP_DP,NULL,NULL);
    for (int it=0; it<ntrans; ++it)
      {
      UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-6.859393905369091105e-01,1e-11),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.103947835973212364e+02,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.092463246472086439e+03,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-1.411433220713928165e+02,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
      "error");
      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
      "error");
      }
    DEALLOC2D(map);
    DEALLOC2D(alm);
    }
  sharp_destroy_alm_info(alms);
  sharp_destroy_geom_info(tinfo);
  }
/* Runs one synthesis (alm->map) followed by one analysis (map->alm) and
 * reports the round-trip error.
 *
 * ginfo, ainfo    grid and a_lm layout descriptions
 * spin            spin of the transform; spin>0 uses two components per
 *                 transform
 * ntrans          number of simultaneous transforms
 * nv              value OR-ed into the libsharp flag word
 * err_abs,err_rel receive the per-component error arrays produced by
 *                 get_errors(); callers in this file release them with
 *                 DEALLOC
 * t_a2m, t_m2a    optional timing outputs (may be NULL), passed through
 *                 maxTime()
 * op_a2m, op_m2a  optional operation counts (may be NULL), passed through
 *                 totalops()
 *
 * NOTE(review): get_sqsum_and_invert() is presumably what turns the SHARP_ADD
 * analysis into a residual computation; confirm against its definition. */
static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
  int spin, int ntrans, int nv, double **err_abs, double **err_rel,
  double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
  unsigned long long *op_m2a)
  {
  const ptrdiff_t n_alm = get_nalms(ainfo);
  const int n_comp = (spin==0) ? ntrans : 2*ntrans;
  const size_t n_pix = get_npix(ginfo);

  /* maps start out zeroed */
  double **map;
  ALLOC2D(map,double,n_comp,n_pix);
  for (int c=0; c<n_comp; ++c)
    SET_ARRAY(map[c],0,(int)n_pix,0);

  /* every component gets its own random a_lm set, seeded with c+1 */
  dcmplx **alm;
  ALLOC2D(alm,dcmplx,n_comp,n_alm);
  for (int c=0; c<n_comp; ++c)
    random_alm(alm[c],ainfo,spin,c+1);

  /* synthesis */
#ifdef USE_MPI
  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,alm,map,ginfo,ainfo,
    ntrans,SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
#else
  sharp_execute(SHARP_ALM2MAP,spin,alm,map,ginfo,ainfo,ntrans,SHARP_DP|nv,
    t_a2m,op_a2m);
#endif
  if (t_a2m) *t_a2m=maxTime(*t_a2m);
  if (op_a2m) *op_a2m=totalops(*op_a2m);

  double *sqsum=get_sqsum_and_invert(alm,n_alm,n_comp);

  /* analysis, accumulated onto the existing a_lm (SHARP_ADD) */
#ifdef USE_MPI
  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,alm,map,ginfo,ainfo,
    ntrans,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
#else
  sharp_execute(SHARP_MAP2ALM,spin,alm,map,ginfo,ainfo,ntrans,
    SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
#endif
  if (t_m2a) *t_m2a=maxTime(*t_m2a);
  if (op_m2a) *op_m2a=totalops(*op_m2a);

  get_errors(alm, n_alm, n_comp, sqsum, err_abs, err_rel);

  DEALLOC(sqsum);
  DEALLOC2D(map);
  DEALLOC2D(alm);
  }
/* Performs one round trip via do_sht() (no timing, no op counts) and asserts
 * that both the relative and the absolute error of every component stay
 * below 1e-10. */
static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
  int spin, int ntrans, int nv)
  {
  const int n_comp = (spin==0) ? ntrans : 2*ntrans;
  double *abs_err, *rel_err;

  do_sht (ginfo, ainfo, spin, ntrans, nv, &abs_err, &rel_err,
    NULL, NULL, NULL, NULL);

  int c=0;
  while (c<n_comp)
    {
    UTIL_ASSERT((rel_err[c]<1e-10) && (abs_err[c]<1e-10),"error");
    ++c;
    }

  DEALLOC(rel_err);
  DEALLOC(abs_err);
  }
/* "acctest" command: runs the sign/scale regression check, then the
 * round-trip accuracy check on a 128x256 Gauss grid (lmax=mmax=127) for
 * nv=1..6, ntrans=1..6 and spins 0, 1, 2, 3 and 30.  Progress messages are
 * printed by task 0 only. */
static void sharp_acctest(void)
  {
  static const int spins[] = { 0, 1, 2, 3, 30 };
  const int nspins = (int)(sizeof(spins)/sizeof(spins[0]));
  const int is_root = (mytask==0);

  if (is_root) sharp_module_startup("sharp_acctest",1,1,"",1);

  if (is_root) printf("Checking signs and scales.\n");
  check_sign_scale();
  if (is_root) printf("Passed.\n\n");

  if (is_root) printf("Testing map analysis accuracy.\n");
  sharp_geom_info *ginfo;
  sharp_alm_info *ainfo;
  int lmax=127, mmax=127, nlat=128, nlon=256;
  get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);

  for (int nv=1; nv<=6; ++nv)
    for (int ntrans=1; ntrans<=6; ++ntrans)
      for (int s=0; s<nspins; ++s)
        check_accuracy(ginfo,ainfo,spins[s],ntrans,nv);

  sharp_destroy_alm_info(ainfo);
  sharp_destroy_geom_info(ginfo);
  if (is_root) printf("Passed.\n\n");
  }
/* "test" command: times and checks a single transform configuration.
 *
 * Expected command line (argv[1] is the command name itself):
 *   argv[2] grid name, argv[3] lmax, argv[4] mmax, argv[5] geom1,
 *   argv[6] geom2, argv[7] spin, argv[8] ntrans
 *
 * The round trip is repeated until the accumulated time exceeds 2 seconds;
 * the best alm2map / map2alm times are reported together with the error of
 * the last run, the memory high water mark and a one-line summary record. */
static void sharp_test (int argc, const char **argv)
  {
  if (mytask==0) sharp_announce("sharp_test");
  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
  int lmax=atoi(argv[3]);
  int mmax=atoi(argv[4]);
  int gpar1=atoi(argv[5]);
  int gpar2=atoi(argv[6]);
  int spin=atoi(argv[7]);
  int ntrans=atoi(argv[8]);

  if (mytask==0) printf("Testing map analysis accuracy.\n");
  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);

  sharp_geom_info *ginfo;
  sharp_alm_info *ainfo;
  /* get_infos() may replace non-positive mmax/gpar1/gpar2 by defaults */
  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);

  int ncomp = ntrans*((spin==0) ? 1 : 2);
  double t_a2m=1e30, t_m2a=1e30;
  unsigned long long op_a2m, op_m2a;
  double *err_abs,*err_rel;

  double t_acc=0;
  int nrpt=0;
  while(1)
    {
    ++nrpt;
    double ta2m2, tm2a2;
    do_sht (ginfo, ainfo, spin, ntrans, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
      &op_a2m, &op_m2a);
    if (ta2m2<t_a2m) t_a2m=ta2m2;
    if (tm2a2<t_m2a) t_m2a=tm2a2;
    /* note: this accumulates the best-so-far times, not this run's times */
    t_acc+=t_a2m+t_m2a;
    if (t_acc>2.)
      {
      if (mytask==0) printf("Best of %d runs\n",nrpt);
      break; /* keep the error arrays of the final run for reporting */
      }
    DEALLOC(err_abs);
    DEALLOC(err_rel);
    }

  if (mytask==0) printf("wall time for alm2map: %fs\n",t_a2m);
  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*op_a2m/t_a2m);
  if (mytask==0) printf("wall time for map2alm: %fs\n",t_m2a);
  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*op_m2a/t_m2a);
  if (mytask==0)
    for (int i=0; i<ncomp; ++i)
      printf("component %i: rms %e, maxerr %e\n",i,err_rel[i], err_abs[i]);

  /* bytes held by the a_lm (16 per coefficient) and maps (8 per pixel) */
  double iosize = ncomp*(16.*get_nalms(ainfo) + 8.*get_npix(ginfo));
  iosize = allreduceSumDouble(iosize);

  sharp_destroy_alm_info(ainfo);
  sharp_destroy_geom_info(ginfo);

  double tmem=totalMem();
  if (mytask==0)
    printf("\nMemory high water mark: %.2f MB\n",tmem/(1<<20));
  if (mytask==0)
    printf("Memory overhead: %.2f MB (%.2f%% of working set)\n",
      (tmem-iosize)/(1<<20),100.*(1.-iosize/tmem));

#ifdef _OPENMP
  int nomp=omp_get_max_threads();
#else
  int nomp=1;
#endif

  double maxerel=0., maxeabs=0.;
  for (int i=0; i<ncomp; ++i)
    {
    if (maxerel<err_rel[i]) maxerel=err_rel[i];
    if (maxeabs<err_abs[i]) maxeabs=err_abs[i];
    }

  /* getenv() returns NULL when HOST is not set (it is often a non-exported
     shell variable); passing NULL to a %s conversion is undefined behaviour,
     so substitute a placeholder. */
  const char *host=getenv("HOST");
  if (host==NULL) host="unknown";

  if (mytask==0)
    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
      " %9.2f %6.2f %.2e %.2e\n",
      host,argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2,
      ntrans,t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
      100.*(1.-iosize/tmem),maxerel,maxeabs);

  DEALLOC(err_abs);
  DEALLOC(err_rel);
  }
/* "bench" command: for one transform configuration, compares the timings
 * obtained with nv=0 against those obtained with nv=1..6.
 *
 * Command line layout is the same as for the "test" command (grid, lmax,
 * mmax, geom1, geom2, spin, ntrans in argv[2..8]).  For every nv the round
 * trip is repeated at least twice and until 3 seconds have accumulated.
 * Task 0 prints the fastest nv>0 setting, its time, the gain relative to
 * nv=0 and the resulting GFlops figure, separately for both directions. */
static void sharp_bench (int argc, const char **argv)
  {
  if (mytask==0) sharp_announce("sharp_bench");
  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
  int lmax=atoi(argv[3]);
  int mmax=atoi(argv[4]);
  int gpar1=atoi(argv[5]);
  int gpar2=atoi(argv[6]);
  int spin=atoi(argv[7]);
  int ntrans=atoi(argv[8]);

  if (mytask==0) printf("Testing map analysis accuracy.\n");
  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);

  sharp_geom_info *ginfo;
  sharp_alm_info *ainfo;
  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);

  /* best times for nv==0 ("auto") and for the best explicit nv ("best") */
  double auto_a2m=1e30, auto_m2a=1e30;
  double best_a2m=1e30, best_m2a=1e30;
  unsigned long long best_ops_a2m=0, best_ops_m2a=0;
  int best_nv_a2m=-1, best_nv_m2a=-1;

  for (int nv=0; nv<=6; ++nv)
    {
    int runs=0;
    double elapsed=0;
    for (;;)
      {
      double cur_a2m, cur_m2a;
      unsigned long long ops_a2m, ops_m2a;
      double *ea, *er;
      do_sht (ginfo, ainfo, spin, ntrans, nv, &ea, &er,
        &cur_a2m, &cur_m2a, &ops_a2m, &ops_m2a);
      DEALLOC(ea);
      DEALLOC(er);

      elapsed+=cur_a2m+cur_m2a;
      ++runs;

      if (nv!=0)
        {
        if (cur_a2m<best_a2m)
          { best_nv_a2m=nv; best_a2m=cur_a2m; best_ops_a2m=ops_a2m; }
        if (cur_m2a<best_m2a)
          { best_nv_m2a=nv; best_m2a=cur_m2a; best_ops_m2a=ops_m2a; }
        }
      else
        {
        if (cur_a2m<auto_a2m) auto_a2m=cur_a2m;
        if (cur_m2a<auto_m2a) auto_m2a=cur_m2a;
        }

      /* stop once there were at least two runs and 3 seconds are used up */
      if ((runs>=2) && !(elapsed<3.)) break;
      }
    }

  if (mytask==0)
    {
    printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
      best_nv_a2m,best_a2m,100.*(auto_a2m-best_a2m)/auto_a2m,
      1e-9*best_ops_a2m/best_a2m);
    printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
      best_nv_m2a,best_m2a,100.*(auto_m2a-best_m2a)/auto_m2a,
      1e-9*best_ops_m2a/best_m2a);
    }

  sharp_destroy_alm_info(ainfo);
  sharp_destroy_geom_info(ginfo);
  }
/* Program entry point: initialises the task bookkeeping (MPI rank/size when
 * built with USE_MPI, otherwise a single task) and dispatches on argv[1],
 * which must be one of "acctest", "test" or "bench". */
int main(int argc, const char **argv)
  {
#ifdef USE_MPI
  MPI_Init(NULL,NULL);
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
#else
  mytask=0;
  ntasks=1;
#endif

  UTIL_ASSERT(argc>=2,"need at least one command line argument");

  const char *command=argv[1];
  if (!strcmp(command,"acctest"))
    sharp_acctest();
  else if (!strcmp(command,"test"))
    sharp_test(argc,argv);
  else if (!strcmp(command,"bench"))
    sharp_bench(argc,argv);
  else
    UTIL_FAIL("unknown command");

#ifdef USE_MPI
  MPI_Finalize();
#endif
  return 0;
  }

View File

@ -25,7 +25,7 @@
/* \file sharp_vecsupport.h /* \file sharp_vecsupport.h
* Convenience functions for vector arithmetics * Convenience functions for vector arithmetics
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012,2013 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -40,34 +40,46 @@ typedef double Ts;
#if (VLEN==1) #if (VLEN==1)
typedef double Tv; typedef double Tv;
typedef float Tv_s;
typedef int Tm;
#define vadd(a,b) ((a)+(b)) #define vadd(a,b) ((a)+(b))
#define vadd_s(a,b) ((a)+(b))
#define vaddeq(a,b) ((a)+=(b)) #define vaddeq(a,b) ((a)+=(b))
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsub(a,b) ((a)-(b)) #define vsub(a,b) ((a)-(b))
#define vsub_s(a,b) ((a)-(b))
#define vsubeq(a,b) ((a)-=(b)) #define vsubeq(a,b) ((a)-=(b))
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmul(a,b) ((a)*(b)) #define vmul(a,b) ((a)*(b))
#define vmul_s(a,b) ((a)*(b))
#define vmuleq(a,b) ((a)*=(b)) #define vmuleq(a,b) ((a)*=(b))
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vfmaeq(a,b,c) ((a)+=(b)*(c)) #define vfmaeq(a,b,c) ((a)+=(b)*(c))
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
#define vfmseq(a,b,c) ((a)-=(b)*(c)) #define vfmseq(a,b,c) ((a)-=(b)*(c))
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e)) #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e)) #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
#define vneg(a) (-(a)) #define vneg(a) (-(a))
#define vload(a) (a) #define vload(a) (a)
#define vload_s(a) (a)
#define vloadu(p) (*(p))
#define vloadu_s(p) (*(p))
#define vabs(a) fabs(a) #define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a) #define vsqrt(a) sqrt(a)
#define vlt(a,b) (((a)<(b))?1.:0.) #define vlt(a,b) ((a)<(b))
#define vgt(a,b) (((a)>(b))?1.:0.) #define vgt(a,b) ((a)>(b))
#define vge(a,b) (((a)>=(b))?1.:0.) #define vge(a,b) ((a)>=(b))
#define vne(a,b) (((a)!=(b))?1.:0.) #define vne(a,b) ((a)!=(b))
#define vand(a,b) ((((a)*(b))!=0.)?1.:0.) #define vand_mask(a,b) ((a)&&(b))
#define vor(a,b) ((((a)+(b))!=0.)?1.:0.) #define vstoreu(p, a) (*(p)=a)
#define vstoreu_s(p, a) (*(p)=a)
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; } static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) ((a)!=0.) #define vanyTrue(a) (a)
#define vallTrue(a) ((a)!=0.) #define vallTrue(a) (a)
#define vblend(m,a,b) (((m)!=0.) ? (a) : (b))
#define vzero 0. #define vzero 0.
#define vone 1. #define vone 1.
@ -85,14 +97,32 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#endif #endif
typedef __m128d Tv; typedef __m128d Tv;
typedef __m128 Tv_s;
typedef __m128d Tm;
#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vzero _mm_setzero_pd()
#define vone _mm_set1_pd(1.)
#define vadd(a,b) _mm_add_pd(a,b) #define vadd(a,b) _mm_add_pd(a,b)
#define vadd_s(a,b) _mm_add_ps(a,b)
#define vaddeq(a,b) a=_mm_add_pd(a,b) #define vaddeq(a,b) a=_mm_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm_sub_pd(a,b) #define vsub(a,b) _mm_sub_pd(a,b)
#define vsub_s(a,b) _mm_sub_ps(a,b)
#define vsubeq(a,b) a=_mm_sub_pd(a,b) #define vsubeq(a,b) a=_mm_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm_mul_pd(a,b) #define vmul(a,b) _mm_mul_pd(a,b)
#define vmul_s(a,b) _mm_mul_ps(a,b)
#define vmuleq(a,b) a=_mm_mul_pd(a,b) #define vmuleq(a,b) a=_mm_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c)) #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c)) #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \ #define vfmaaeq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
@ -100,51 +130,61 @@ typedef __m128d Tv;
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
#define vload(a) _mm_set1_pd(a) #define vload(a) _mm_set1_pd(a)
#define vload_s(a) _mm_set1_ps(a)
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a) #define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b) #define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b) #define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b) #define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b) #define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand(a,b) _mm_and_pd(a,b) #define vand_mask(a,b) _mm_and_pd(a,b)
#define vor(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b) #define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b); #define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0) #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3) #define vallTrue(a) (_mm_movemask_pd(a)==3)
#if defined(__SSE4_1__) #define vloadu(p) _mm_loadu_pd(p)
#define vblend(m,a,b) _mm_blendv_pd(b,a,m) #define vloadu_s(p) _mm_loadu_ps(p)
#else #define vstoreu(p, v) _mm_storeu_pd(p, v)
static inline Tv vblend(Tv m, Tv a, Tv b) #define vstoreu_s(p, v) _mm_storeu_ps(p, v)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vzero _mm_setzero_pd()
#define vone _mm_set1_pd(1.)
#endif #endif
#if (VLEN==4) #if (VLEN==4)
#include <immintrin.h> #include <immintrin.h>
#ifdef __FMA4__ #if (USE_FMA4)
#include <x86intrin.h> #include <x86intrin.h>
#endif #endif
typedef __m256d Tv; typedef __m256d Tv;
typedef __m256 Tv_s;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vzero _mm256_setzero_pd()
#define vone _mm256_set1_pd(1.)
#define vadd(a,b) _mm256_add_pd(a,b) #define vadd(a,b) _mm256_add_pd(a,b)
#define vadd_s(a,b) _mm256_add_ps(a,b)
#define vaddeq(a,b) a=_mm256_add_pd(a,b) #define vaddeq(a,b) a=_mm256_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm256_sub_pd(a,b) #define vsub(a,b) _mm256_sub_pd(a,b)
#define vsub_s(a,b) _mm256_sub_ps(a,b)
#define vsubeq(a,b) a=_mm256_sub_pd(a,b) #define vsubeq(a,b) a=_mm256_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm256_mul_pd(a,b) #define vmul(a,b) _mm256_mul_pd(a,b)
#define vmul_s(a,b) _mm256_mul_ps(a,b)
#define vmuleq(a,b) a=_mm256_mul_pd(a,b) #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
#ifdef __FMA4__ #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
#if (USE_FMA4)
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else #else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c)) #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \ #define vfmaaeq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@ -153,21 +193,62 @@ typedef __m256d Tv;
#endif #endif
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
#define vload(a) _mm256_set1_pd(a) #define vload(a) _mm256_set1_pd(a)
#define vload_s(a) _mm256_set1_ps(a)
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a) #define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ) #define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ) #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand(a,b) _mm256_and_pd(a,b) #define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b) #define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b) #define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0) #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15) #define vallTrue(a) (_mm256_movemask_pd(a)==15)
#define vblend(m,a,b) _mm256_blendv_pd(b,a,m)
#define vzero _mm256_setzero_pd() #define vloadu(p) _mm256_loadu_pd(p)
#define vone _mm256_set1_pd(1.) #define vloadu_s(p) _mm256_loadu_ps(p)
#define vstoreu(p, v) _mm256_storeu_pd(p, v)
#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
#endif
#if (VLEN==8)
#include <immintrin.h>
typedef __m512d Tv;
typedef __mmask8 Tm;
#define vadd(a,b) _mm512_add_pd(a,b)
#define vaddeq(a,b) a=_mm512_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsub(a,b) _mm512_sub_pd(a,b)
#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmul(a,b) _mm512_mul_pd(a,b)
#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
#define vload(a) _mm512_set1_pd(a)
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmplt_pd_mask(a,b)
#define vgt(a,b) _mm512_cmpnle_pd_mask(a,b)
#define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
#define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
#define vand_mask(a,b) ((a)&(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)
#define vzero _mm512_setzero_pd()
#define vone _mm512_set1_pd(1.)
#endif #endif

View File

@ -25,14 +25,18 @@
/*! \file sharp_vecutil.h /*! \file sharp_vecutil.h
* Functionality related to vector instruction support * Functionality related to vector instruction support
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012,2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
#ifndef SHARP_VECUTIL_H #ifndef SHARP_VECUTIL_H
#define SHARP_VECUTIL_H #define SHARP_VECUTIL_H
#if (defined (__AVX__)) #ifndef VLEN
#if (defined (__MIC__))
#define VLEN 8
#elif (defined (__AVX__))
#define VLEN 4 #define VLEN 4
#elif (defined (__SSE2__)) #elif (defined (__SSE2__))
#define VLEN 2 #define VLEN 2
@ -41,3 +45,19 @@
#endif #endif
#endif #endif
#if (VLEN==1)
#define VLEN_s 1
#else
#define VLEN_s (2*VLEN)
#endif
#ifndef USE_FMA4
#ifdef __FMA4__
#define USE_FMA4 1
#else
#define USE_FMA4 0
#endif
#endif
#endif

View File

@ -25,7 +25,7 @@
/* /*
* Helper code for efficient calculation of Y_lm(theta,phi=0) * Helper code for efficient calculation of Y_lm(theta,phi=0)
* *
* Copyright (C) 2005-2012 Max-Planck-Society * Copyright (C) 2005-2014 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -47,7 +47,9 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
gen->lmax = l_max; gen->lmax = l_max;
gen->mmax = m_max; gen->mmax = m_max;
UTIL_ASSERT(spin>=0,"incorrect spin"); UTIL_ASSERT(spin>=0,"incorrect spin: must be nonnegative");
UTIL_ASSERT(l_max>=spin,"incorrect l_max: must be >= spin");
UTIL_ASSERT(l_max>=m_max,"incorrect l_max: must be >= m_max");
gen->s = spin; gen->s = spin;
UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0), UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0),
"bad value for min/maxscale"); "bad value for min/maxscale");

View File

@ -0,0 +1 @@
# work around broken setuptools monkey patching

View File

@ -0,0 +1 @@
# Placeholder attribute: only its existence matters, the value is never used
# as a real build command.
# NOTE(review): presumably this is the name setuptools looks up when probing
# for Pyrex (see the README in the fake package directory) -- confirm against
# setup.py.
build_ext = "yes, it's there!"

View File

@ -0,0 +1 @@
# work around broken setuptools monkey patching

View File

@ -0,0 +1,2 @@
This directory is here to fool setuptools into building .pyx files
even if Pyrex is not installed. See ../setup.py.

View File

@ -0,0 +1 @@
from .libsharp import *

View File

@ -0,0 +1,79 @@
# C declarations pulled in through "sharp.h" and made visible to Cython code.
cdef extern from "sharp.h":
    # Declared as a plain integer type here; the C compiler takes the real
    # definition from the header.
    ctypedef long ptrdiff_t

    # Legendre transform helpers, single- and double-precision variants.
    void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
                                    float *out, ptrdiff_t nx)
    void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
                                  double *out, ptrdiff_t nx)
    void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
    void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
    void sharp_legendre_roots(int n, double *x, double *w)

    # sharp_lowlevel.h
    # Both info structs are opaque to Cython: only pointers to them are
    # passed around, their members are never accessed from here.
    ctypedef struct sharp_alm_info:
        pass

    ctypedef struct sharp_geom_info:
        pass

    # Constructors, destructors and size queries for the info structs.
    void sharp_make_alm_info (int lmax, int mmax, int stride,
                              ptrdiff_t *mvstart, sharp_alm_info **alm_info)
    void sharp_make_geom_info (int nrings, int *nph, ptrdiff_t *ofs,
                               int *stride, double *phi0, double *theta,
                               double *wgt, sharp_geom_info **geom_info)
    void sharp_destroy_alm_info(sharp_alm_info *info)
    void sharp_destroy_geom_info(sharp_geom_info *info)
    ptrdiff_t sharp_map_size(sharp_geom_info *info)
    ptrdiff_t sharp_alm_count(sharp_alm_info *self)

    # Transform kinds accepted by sharp_execute().
    ctypedef enum sharp_jobtype:
        SHARP_YtW
        SHARP_Yt
        SHARP_WY
        SHARP_Y

    # Bits for the `flags` argument of sharp_execute().
    ctypedef enum:
        SHARP_DP
        SHARP_ADD

    # Declared nogil so it can be called inside a `with nogil:` block.
    void sharp_execute(sharp_jobtype type_,
                       int spin,
                       void *alm,
                       void *map,
                       sharp_geom_info *geom_info,
                       sharp_alm_info *alm_info,
                       int ntrans,
                       int flags,
                       double *time,
                       unsigned long long *opcnt) nogil

    # Possible return value of sharp_execute_mpi_maybe().
    ctypedef enum:
        SHARP_ERROR_NO_MPI

    # Like sharp_execute(), but takes a pointer to an MPI communicator and
    # returns an int status code.
    int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
                                 void *alm, void *map, sharp_geom_info *geom_info,
                                 sharp_alm_info *alm_info, int ntrans, int flags, double *time,
                                 unsigned long long *opcnt) nogil
# Convenience constructors for sharp_geom_info objects; each one stores the
# newly created object through the trailing `geom_info` out-parameter.
cdef extern from "sharp_geomhelpers.h":
    # HEALPix geometry, optionally restricted to `nrings` rings listed in
    # `rings` and with per-ring weights in `weight`.
    void sharp_make_subset_healpix_geom_info(
        int nside, int stride, int nrings,
        int *rings, double *weight, sharp_geom_info **geom_info)
    # Gauss-Legendre geometry with `nrings` rings of `nphi` pixels each.
    void sharp_make_gauss_geom_info(
        int nrings, int nphi, double phi0,
        int stride_lon, int stride_lat, sharp_geom_info **geom_info)
# Convenience constructors for sharp_alm_info objects; each one stores the
# newly created object through the trailing `alm_info` out-parameter.
cdef extern from "sharp_almhelpers.h":
    void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
                                         sharp_alm_info **alm_info)
    void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
                                          sharp_alm_info **alm_info)
    # `ms` holds the `nm` m values to include.
    void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
                                                 int nm, const int *ms, sharp_alm_info **alm_info)

View File

@ -0,0 +1,256 @@
import numpy as np
__all__ = ['legendre_transform', 'legendre_roots', 'sht', 'synthesis', 'adjoint_synthesis',
'analysis', 'adjoint_analysis', 'healpix_grid', 'triangular_order', 'rectangular_order',
'packed_real_order']
def legendre_transform(x, bl, out=None):
    """Evaluate a Legendre series at the points ``x``.

    Dispatches to the double- or single-precision libsharp kernel depending
    on the dtype of ``x``; ``bl`` is converted to the same precision.

    Parameters
    ----------
    x : array-like, 1-D, float64 or float32
        Evaluation points.  Anything ``np.asarray`` accepts is allowed.
    bl : array-like, 1-D
        Series coefficients; ``len(bl) - 1`` is used as ``lmax``.
    out : ndarray, optional
        Output buffer with the same length and dtype as ``x``.  A new array
        is allocated when omitted.

    Returns
    -------
    ndarray
        The result array (``out`` itself if ``x`` is empty).

    Raises
    ------
    ValueError
        If the dtype of ``x`` is neither float64 nor float32.
    """
    # Accept plain sequences as well as ndarrays; existing ndarrays are
    # passed through unchanged by np.asarray.
    x = np.asarray(x)
    bl = np.asarray(bl)
    if out is None:
        out = np.empty_like(x)
    if out.shape[0] == 0:
        # Nothing to compute; also avoids taking &x[0] of an empty buffer.
        return out
    elif x.dtype == np.float64:
        if bl.dtype != np.float64:
            bl = bl.astype(np.float64)
        return _legendre_transform(x, bl, out=out)
    elif x.dtype == np.float32:
        if bl.dtype != np.float32:
            bl = bl.astype(np.float32)
        return _legendre_transform_s(x, bl, out=out)
    else:
        raise ValueError("unsupported dtype")
def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out):
    """Double-precision kernel wrapper: fills ``out`` with the Legendre series
    with coefficients ``bl`` evaluated at ``x`` and returns it as an ndarray.

    Raises ValueError if ``x`` and ``out`` differ in length.
    """
    cdef Py_ssize_t npoints = x.shape[0]
    cdef Py_ssize_t lmax = bl.shape[0] - 1
    if npoints != out.shape[0]:
        raise ValueError('x and out must have same shape')
    # NULL for recfac: no precomputed recursion factors are supplied
    sharp_legendre_transform(&bl[0], NULL, lmax, &x[0], &out[0], npoints)
    return np.asarray(out)
def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
    """Single-precision kernel wrapper: fills ``out`` with the Legendre series
    with coefficients ``bl`` evaluated at ``x`` and returns it as an ndarray.

    Raises ValueError if ``x`` and ``out`` differ in length.
    """
    cdef Py_ssize_t npoints = x.shape[0]
    cdef Py_ssize_t lmax = bl.shape[0] - 1
    if npoints != out.shape[0]:
        raise ValueError('x and out must have same shape')
    # NULL for recfac: no precomputed recursion factors are supplied
    sharp_legendre_transform_s(&bl[0], NULL, lmax, &x[0], &out[0], npoints)
    return np.asarray(out)
def legendre_roots(n):
    """Return ``(x, w)``: two double arrays of length ``n`` filled by
    ``sharp_legendre_roots``.  For ``n == 0`` two empty arrays are returned
    without calling into libsharp.
    """
    nodes = np.empty(n, np.double)
    weights = np.empty(n, np.double)
    cdef double[::1] nodes_view = nodes
    cdef double[::1] weights_view = weights
    # sanity check on the freshly allocated buffers
    if nodes_view.shape[0] != n or weights_view.shape[0] != n:
        raise AssertionError()
    if n > 0:
        sharp_legendre_roots(n, &nodes_view[0], &weights_view[0])
    return nodes, weights
# Maps the string job names accepted by sht() to the libsharp
# sharp_jobtype enum values.
JOBTYPE_TO_CONST = {
    'Y': SHARP_Y,
    'Yt': SHARP_Yt,
    'WY': SHARP_WY,
    'YtW': SHARP_YtW
}
def sht(jobtype, geom_info ginfo, alm_info ainfo, double[:, :, ::1] input,
        int spin=0, comm=None, add=False):
    """Run one libsharp job and return the freshly allocated result.

    Parameters
    ----------
    jobtype : str
        One of the keys of ``JOBTYPE_TO_CONST`` ('Y', 'Yt', 'WY', 'YtW').
    ginfo : geom_info
        Pixelisation description.
    ainfo : alm_info
        a_lm layout description.
    input : C-contiguous double array, shape (ntrans, ncomp, n)
        ``ncomp`` must be 1 for ``spin == 0`` and 2 otherwise.  For 'Y' and
        'WY' this is the a_lm side, for 'Yt' and 'YtW' the map side.
    spin : int
        Spin of the transform.
    comm : mpi4py communicator, optional
        When given, the MPI entry point is used.
    add : bool
        Sets the SHARP_ADD flag.  The output buffer is always newly
        allocated here, so it is zero-initialised in this case.

    Returns
    -------
    ndarray of float64, shape (ntrans, ncomp, local_size), where local_size
    is ``ginfo.local_size()`` for 'Y'/'WY' and ``ainfo.local_size()``
    otherwise.

    Raises
    ------
    ValueError
        Wrong ``input.shape[1]`` for the given spin, or unknown ``jobtype``.
    TypeError
        ``comm`` is not an mpi4py communicator.
    Exception
        MPI was requested but libsharp was built without it.
    """
    cdef void *comm_ptr
    cdef int flags = SHARP_DP | (SHARP_ADD if add else 0)
    cdef int r
    cdef sharp_jobtype jobtype_i
    cdef double[:, :, ::1] output_buf
    # libsharp's ntrans counts transforms, not components: a spin != 0
    # transform consumes two alm/map pointers.  Passing shape[0]*shape[1]
    # as ntrans would make libsharp read past the pointer tables below.
    cdef int ntrans = input.shape[0]
    cdef int ncomp = input.shape[0] * input.shape[1]
    cdef int i, j

    if spin == 0 and input.shape[1] != 1:
        raise ValueError('For spin == 0, we need input.shape[1] == 1')
    elif spin != 0 and input.shape[1] != 2:
        raise ValueError('For spin != 0, we need input.shape[1] == 2')

    # One buffer holds both pointer tables: first ncomp alm pointers,
    # then ncomp map pointers.
    cdef size_t[::1] ptrbuf = np.empty(2 * ncomp, dtype=np.uintp)
    cdef double **alm_ptrs = <double**>&ptrbuf[0]
    cdef double **map_ptrs = <double**>&ptrbuf[ncomp]

    try:
        jobtype_i = JOBTYPE_TO_CONST[jobtype]
    except KeyError:
        raise ValueError('jobtype must be one of: %s' % ', '.join(sorted(JOBTYPE_TO_CONST.keys())))

    # With SHARP_ADD libsharp accumulates into the output, so it must not
    # contain uninitialised memory.
    allocate = np.zeros if add else np.empty

    if jobtype_i == SHARP_Y or jobtype_i == SHARP_WY:
        # alm -> map direction: input holds alm, output holds maps
        output = allocate((input.shape[0], input.shape[1], ginfo.local_size()), dtype=np.float64)
        output_buf = output
        for i in range(input.shape[0]):
            for j in range(input.shape[1]):
                alm_ptrs[i * input.shape[1] + j] = &input[i, j, 0]
                map_ptrs[i * input.shape[1] + j] = &output_buf[i, j, 0]
    else:
        # map -> alm direction: input holds maps, output holds alm
        output = allocate((input.shape[0], input.shape[1], ainfo.local_size()), dtype=np.float64)
        output_buf = output
        for i in range(input.shape[0]):
            for j in range(input.shape[1]):
                alm_ptrs[i * input.shape[1] + j] = &output_buf[i, j, 0]
                map_ptrs[i * input.shape[1] + j] = &input[i, j, 0]

    if comm is None:
        with nogil:
            sharp_execute (
                jobtype_i,
                geom_info=ginfo.ginfo, alm_info=ainfo.ainfo,
                spin=spin, alm=alm_ptrs, map=map_ptrs,
                ntrans=ntrans, flags=flags, time=NULL, opcnt=NULL)
    else:
        from mpi4py import MPI
        if not isinstance(comm, MPI.Comm):
            raise TypeError('comm must be an mpi4py communicator')
        from .libsharp_mpi import _addressof
        comm_ptr = <void*><size_t>_addressof(comm)
        with nogil:
            r = sharp_execute_mpi_maybe (
                comm_ptr, jobtype_i,
                geom_info=ginfo.ginfo, alm_info=ainfo.ainfo,
                spin=spin, alm=alm_ptrs, map=map_ptrs,
                ntrans=ntrans, flags=flags, time=NULL, opcnt=NULL)
        if r == SHARP_ERROR_NO_MPI:
            raise Exception('MPI requested, but not available')
    return output
def synthesis(*positional, **keywords):
    """Shorthand for ``sht('Y', ...)``; all arguments are forwarded."""
    result = sht('Y', *positional, **keywords)
    return result
def adjoint_synthesis(*positional, **keywords):
    """Shorthand for ``sht('Yt', ...)``; all arguments are forwarded."""
    result = sht('Yt', *positional, **keywords)
    return result
def analysis(*positional, **keywords):
    """Shorthand for ``sht('YtW', ...)``; all arguments are forwarded."""
    result = sht('YtW', *positional, **keywords)
    return result
def adjoint_analysis(*positional, **keywords):
    """Shorthand for ``sht('WY', ...)``; all arguments are forwarded."""
    result = sht('WY', *positional, **keywords)
    return result
#
# geom_info
#
class NotInitializedError(Exception):
    """Raised when a wrapper object is used before its underlying libsharp
    structure has been created."""
cdef class geom_info:
    """Owner of a ``sharp_geom_info`` pointer.

    The pointer starts out NULL; subclasses fill it in.  It is destroyed
    when the Python object is deallocated.
    """
    cdef sharp_geom_info *ginfo

    def __cinit__(self, *args, **kw):
        self.ginfo = NULL

    def local_size(self):
        """Return ``sharp_map_size`` of the wrapped geometry.

        Raises NotInitializedError while no geometry has been created.
        """
        if self.ginfo != NULL:
            return sharp_map_size(self.ginfo)
        raise NotInitializedError()

    def __dealloc__(self):
        if self.ginfo == NULL:
            return
        sharp_destroy_geom_info(self.ginfo)
        self.ginfo = NULL
cdef class healpix_grid(geom_info):
    """HEALPix geometry, optionally restricted to a subset of rings and
    optionally carrying per-ring quadrature weights."""

    # (nside, 'T'/'Q'/'U') -> ring weight array loaded from file
    _weight_cache = {}

    def __init__(self, int nside, stride=1, int[::1] rings=None, double[::1] weights=None):
        cdef int ring_count = 4 * nside - 1
        cdef int *ring_ptr = NULL
        cdef double *weight_ptr = NULL

        if weights is not None and weights.shape[0] != 2 * nside:
            raise ValueError('weights must have length 2 * nside')
        if rings is not None:
            # an explicit ring list replaces the default of all 4*nside-1 rings
            ring_count = rings.shape[0]
            ring_ptr = &rings[0]
        if weights is not None:
            weight_ptr = &weights[0]

        sharp_make_subset_healpix_geom_info(nside, stride, ring_count,
                                            ring_ptr, weight_ptr, &self.ginfo)

    @classmethod
    def load_ring_weights(cls, nside, fields):
        """
        Read HEALPix ring weights from disk.

        The file is looked up in the ``data`` subdirectory of the directory
        named by the ``HEALPIX`` environment variable, which must be set.
        Loaded weights are cached per ``(nside, field)``.

        Parameters
        ----------

        nside: int
            HEALPix nside parameter

        fields: tuple of str
            Weights to return: ``('T',)`` for the scalar weights only,
            ``('T', 'Q', 'U')`` for all of them

        Returns
        -------

        List of NumPy arrays (copies of the cached data), in the order given
        by ``fields``.
        """
        import os
        from astropy.io import fits
        data_path = os.path.join(os.environ['HEALPIX'], 'data')
        column_names = {
            'T': 'TEMPERATURE WEIGHTS',
            'Q': 'Q-POLARISATION WEIGHTS',
            'U': 'U-POLARISATION WEIGHTS'}

        missing = []
        for name in fields:
            if (nside, name) not in cls._weight_cache:
                missing.append(name)

        if missing:
            weight_file = os.path.join(data_path, 'weight_ring_n%05d.fits' % nside)
            hdulist = fits.open(weight_file)
            try:
                for name in missing:
                    column = hdulist[1].data.field(column_names[name])
                    ring_weights = column.ravel().astype(np.double)
                    # the stored values are incremented by one before caching
                    ring_weights += 1
                    cls._weight_cache[(nside, name)] = ring_weights
            finally:
                hdulist.close()

        result = []
        for name in fields:
            result.append(cls._weight_cache[(nside, name)].copy())
        return result
#
# alm_info
#
cdef class alm_info:
    """Owner of a ``sharp_alm_info`` pointer.

    The pointer starts out NULL; subclasses fill it in.  It is destroyed
    when the Python object is deallocated.
    """
    cdef sharp_alm_info *ainfo

    def __cinit__(self, *args, **kw):
        self.ainfo = NULL

    def local_size(self):
        """Return ``sharp_alm_count`` of the wrapped layout.

        Raises NotInitializedError while no layout has been created.
        """
        if self.ainfo != NULL:
            return sharp_alm_count(self.ainfo)
        raise NotInitializedError()

    def __dealloc__(self):
        if self.ainfo == NULL:
            return
        sharp_destroy_alm_info(self.ainfo)
        self.ainfo = NULL
cdef class triangular_order(alm_info):
    # alm layout created through sharp_make_triangular_alm_info
    def __init__(self, int lmax, mmax=None, stride=1):
        # mmax falls back to lmax when the caller does not supply it
        if mmax is None:
            mmax = lmax
        sharp_make_triangular_alm_info(lmax, mmax, stride, &self.ainfo)
cdef class rectangular_order(alm_info):
    # alm layout created through sharp_make_rectangular_alm_info
    def __init__(self, int lmax, mmax=None, stride=1):
        # mmax falls back to lmax when the caller does not supply it
        if mmax is None:
            mmax = lmax
        sharp_make_rectangular_alm_info(lmax, mmax, stride, &self.ainfo)
cdef class packed_real_order(alm_info):
    # alm layout created through sharp_make_mmajor_real_packed_alm_info
    def __init__(self, int lmax, stride=1, int[::1] ms=None):
        # Without an explicit list of m values, request lmax + 1 of them and
        # pass a NULL list to libsharp
        cdef int m_count = lmax + 1
        cdef int *m_ptr = NULL
        if ms is not None:
            m_count = ms.shape[0]
            m_ptr = &ms[0]
        sharp_make_mmajor_real_packed_alm_info(lmax=lmax, stride=stride,
                                               nm=m_count,
                                               ms=m_ptr,
                                               alm_info=&self.ainfo)

View File

@ -0,0 +1,17 @@
# C-level declarations needed to reach the MPI communicator handle stored
# inside an mpi4py ``Comm`` object.
cdef extern from "mpi.h":
    ctypedef void *MPI_Comm

cdef extern from "Python.h":
    object PyLong_FromVoidPtr(void*)

cdef extern:
    ctypedef class mpi4py.MPI.Comm [object PyMPICommObject]:
        cdef MPI_Comm ob_mpi
        cdef unsigned flags


# For compatibility with mpi4py <= 1.3.1
# Newer versions could use the MPI._addressof function
def _addressof(Comm comm):
    # Return the address of the communicator handle held by `comm`
    # as a Python integer.
    cdef void *handle_address = <void*>&comm.ob_mpi
    return PyLong_FromVoidPtr(handle_address)

View File

@ -0,0 +1 @@
# empty

View File

@ -0,0 +1,59 @@
import numpy as np
from scipy.special import legendre
from scipy.special import p_roots
import libsharp
from numpy.testing import assert_allclose
def check_legendre_transform(lmax, ntheta):
    """
    Check libsharp.legendre_transform against a reference computed with
    scipy.special.legendre, for double- and single-precision input.

    Parameters
    ----------

    lmax: int
        Highest Legendre degree; lmax + 1 coefficients are used

    ntheta: int
        Number of evaluation points, spread evenly over [0, pi] in theta

    Raises
    ------

    AssertionError (from assert_allclose) if either the double-precision or
    the single-precision result deviates from the reference.
    """
    ells = np.arange(lmax + 1)
    if lmax >= 1:
        # Gaussian-like taper that reaches 1e-3 at l = lmax, times (2l + 1)
        sigma = -np.log(1e-3) / lmax / (lmax + 1)
        bl = np.exp(-sigma * ells * (ells + 1))
        bl *= (2 * ells + 1)
    else:
        bl = np.asarray([1], dtype=np.double)

    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
    x = np.cos(theta)

    # Compute truth using scipy.special.legendre
    P = np.zeros((ntheta, lmax + 1))
    for ell in range(lmax + 1):
        P[:, ell] = legendre(ell)(x)
    y0 = np.dot(P, bl)

    # double-precision
    y = libsharp.legendre_transform(x, bl)
    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)

    # single-precision: compare the float32 result itself (not `y` again)
    # against the reference, with a correspondingly looser tolerance
    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
    assert_allclose(y32, y0, rtol=1e-5, atol=1e-5)
def test_legendre_transform():
    """
    Generator test: yields (check_legendre_transform, lmax, ntheta) for a
    few fixed grid sizes plus 20 random ones, each combined with a few fixed
    lmax values plus 4 random ones.
    """
    fixed_nthetas = [0, 9, 17, 19]
    random_nthetas = list(np.random.randint(500, size=20))
    for ntheta in fixed_nthetas + random_nthetas:
        # A fresh set of random lmax values is drawn for every ntheta
        lmax_values = [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4))
        for lmax in lmax_values:
            yield check_legendre_transform, lmax, ntheta
def check_legendre_roots(n):
    """
    Compare libsharp.legendre_roots(n) with SciPy's p_roots(n); both the
    nodes and the weights must agree. For n == 0 the reference is a pair of
    empty lists and p_roots is not called.
    """
    if n == 0:
        ref_nodes, ref_weights = [], []
    else:
        ref_nodes, ref_weights = p_roots(n)  # from SciPy
    nodes, weights = libsharp.legendre_roots(n)
    for reference, computed in ((ref_nodes, nodes), (ref_weights, weights)):
        assert_allclose(reference, computed, rtol=1e-14, atol=1e-14)
def test_legendre_roots():
    """
    Generator test for the Legendre root finder in libsharp: each yielded
    case compares its output with the SciPy version for one value of n.
    """
    for n in (0, 1, 32, 33, 128):
        yield check_legendre_roots, n

View File

@ -0,0 +1,34 @@
import numpy as np
import healpy
from scipy.special import legendre
from scipy.special import p_roots
from numpy.testing import assert_allclose
import libsharp
from mpi4py import MPI
def test_basic():
    """
    Smoke test of an MPI-distributed synthesis: the m values are dealt out
    round-robin over the ranks, three identical alm sets are synthesised onto
    a HEALPix grid, and the three resulting maps must be equal. Rank 0 then
    displays the first map interactively.
    """
    comm = MPI.COMM_WORLD
    lmax = 10
    nside = 8
    rank = comm.Get_rank()

    # Rank r owns m = r, r + size, r + 2 * size, ...
    ms = np.arange(rank, lmax + 1, comm.Get_size(), dtype=np.int32)
    order = libsharp.packed_real_order(lmax, ms=ms)
    grid = libsharp.healpix_grid(nside)

    alm = np.zeros(order.local_size())
    # Ranks 0 and 1 set their first local coefficient; all others stay zero
    if rank in (0, 1):
        alm[0] = 1

    stacked_alm = np.repeat(alm[None, None, :], 3, 0)
    maps = libsharp.synthesis(grid, order, stacked_alm, comm=MPI.COMM_WORLD)
    assert np.all(maps[2, :] == maps[1, :])
    assert np.all(maps[1, :] == maps[0, :])

    first_map = maps[0, 0, :]
    if rank == 0:
        healpy.mollzoom(first_map)
        from matplotlib.pyplot import show
        show()

77
external/sharp/python/setup.py vendored Normal file
View File

@ -0,0 +1,77 @@
#! /usr/bin/env python
# Package metadata consumed by the setup() call further down in this file.

descr = """Spherical Harmonic transforms package
Python API for the libsharp spherical harmonic transforms library
"""

import os
import sys

DISTNAME = 'libsharp'
DESCRIPTION = 'libsharp library for fast Spherical Harmonic Transforms'
LONG_DESCRIPTION = descr
# These must be plain strings: a trailing comma after the literal would turn
# each value into a 1-tuple.
MAINTAINER = 'Dag Sverre Seljebotn'
MAINTAINER_EMAIL = 'd.s.seljebotn@astro.uio.no'
URL = 'http://sourceforge.net/projects/libsharp/'
LICENSE = 'GPL'
DOWNLOAD_URL = "http://sourceforge.net/projects/libsharp/"
VERSION = '0.1'
# Add our fake Pyrex at the end of the Python search path
# in order to fool setuptools into allowing compilation of
# pyx files to C files. Importing Cython.Distutils then
# makes Cython the tool of choice for this rather than
# (the possibly nonexisting) Pyrex.
# The path is made absolute so the sys.path entry stays valid even if the
# working directory changes later in the build.
project_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(project_path, 'fake_pyrex'))

from setuptools import setup, find_packages, Extension
from Cython.Distutils import build_ext
import numpy as np

# Locate the libsharp installation. LIBSHARP gives the install prefix;
# LIBSHARP_INCLUDE / LIBSHARP_LIB override the derived include/ and lib/
# sub-directories individually.
libsharp = os.environ.get('LIBSHARP', None)
libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
if libsharp_include is None or libsharp_lib is None:
    # Without both directories the extension cannot be built; abort early
    sys.stderr.write('Please set LIBSHARP environment variable to the install directory of libsharp, '
                     'this script will refer to the lib and include sub-directories. Alternatively '
                     'set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
    sys.exit(1)
if __name__ == "__main__":
    # Trove classifiers describing the package
    trove_classifiers = [
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Topic :: Scientific/Engineering',
    ]

    # The single Cython extension, linked against the installed libsharp
    sharp_extension = Extension(
        "libsharp.libsharp",
        ["libsharp/libsharp.pyx"],
        libraries=["sharp", "fftpack", "c_utils"],
        include_dirs=[libsharp_include],
        library_dirs=[libsharp_lib],
        extra_link_args=["-fopenmp"],
    )

    setup(
        name=DISTNAME,
        version=VERSION,
        description=DESCRIPTION,
        long_description=LONG_DESCRIPTION,
        maintainer=MAINTAINER,
        maintainer_email=MAINTAINER_EMAIL,
        url=URL,
        download_url=DOWNLOAD_URL,
        license=LICENSE,
        classifiers=trove_classifiers,
        packages=find_packages(),
        install_requires=['numpy'],
        test_suite="nose.collector",
        # Well, technically zipping the package will work, but since it's
        # all compiled code it'll just get unzipped again at runtime, which
        # is pointless:
        zip_safe=False,
        cmdclass={"build_ext": build_ext},
        ext_modules=[sharp_extension],
    )

19
external/sharp/runjinja.py vendored Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
"""
Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT.

The output starts with a C comment carrying the md5sum of the template
source, immediately followed by the rendered template.
"""
import sys
import hashlib
from jinja2 import Template, Environment

# Jinja block tags are wrapped in C comments (/*{ ... }*/) so the template
# remains valid C before rendering; variables use the default {{ ... }}.
env = Environment(block_start_string='/*{',
                  block_end_string='}*/',
                  variable_start_string='{{',
                  variable_end_string='}}')

# Extra names made available inside templates
extra_vars = dict(len=len)

source = sys.stdin.read()
# hashlib only accepts bytes. sys.stdin.read() returns text on Python 3 and
# bytes on Python 2, so encode only when needed.
if isinstance(source, bytes):
    source_bytes = source
else:
    source_bytes = source.encode('utf-8')

sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(source_bytes).hexdigest())
sys.stdout.write(env.from_string(source).render(**extra_vars))