SHTns 3.7
sht_func.c
/*
 * Copyright (c) 2010-2021 Centre National de la Recherche Scientifique.
 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
 *
 * nathanael.schaeffer@univ-grenoble-alpes.fr
 *
 * This software is governed by the CeCILL license under French law and
 * abiding by the rules of distribution of free software. You can use,
 * modify and/or redistribute the software under the terms of the CeCILL
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info".
 *
 * The fact that you are presently reading this means that you have had
 * knowledge of the CeCILL license and that you accept its terms.
 *
 */

void SH_Zrotate(shtns_cfg shtns, cplx *Qlm, double alpha, cplx *Rlm)
{
    const long lmax = shtns->lmax;    const int mmax = shtns->mmax;    const int mres = shtns->mres;

    if (Rlm != Qlm) {    // copy m=0, which does not change.
        long l=0;    do { Rlm[l] = Qlm[l]; } while(++l <= lmax);
    }

    const double alpha_half_turns = alpha * (-2./M_PI);    // alpha in units of pi/2; negative sign: rotate frame by angle -alpha
    for (int im=1; im<=mmax; im++) {
        const long lm0 = LiM(shtns,0,im);
        const double k = im*mres;
        // custom range reduction (very accurate as long as n is less than 9e15)
        double n = round(k*alpha_half_turns);        // number of quarter turns
        double k_alpha = fma(k,alpha_half_turns, -n) * (0.5*M_PI);    // should be between -pi/4 and pi/4
        long quadrant = n;
        cplx eima = cos(k_alpha) + I*sin(k_alpha);
        if (quadrant & 2) eima = -eima;
        if (quadrant & 1) eima = I*eima;
        for (long l=im*mres; l<=lmax; ++l)    Rlm[lm0+l] = Qlm[lm0+l] * eima;
    }
}
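
/* Editor's illustration (sketch, not part of the library): the range reduction used
   above, as a standalone helper. Computing cos(k*alpha) and sin(k*alpha) directly
   loses accuracy when k*alpha is large; reducing the angle first in exact units of
   pi/2 keeps the argument of cos/sin within [-pi/4, pi/4], where they are most
   accurate. Same validity domain as above: |k*alpha*2/pi| < 9e15.

static cplx eika_accurate(double k, double alpha)
{
    const double qt = alpha * (2./M_PI);            // alpha in units of pi/2 (quarter turns)
    const double n = round(k*qt);                   // nearest integer number of quarter turns
    const double rem = fma(k, qt, -n) * (0.5*M_PI); // fma-corrected remainder, in [-pi/4, pi/4]
    cplx e = cos(rem) + I*sin(rem);
    const long quadrant = (long) n;
    if (quadrant & 2) e = -e;       // rotate by pi
    if (quadrant & 1) e = I*e;      // rotate by pi/2
    return e;                       // = exp(I*k*alpha), accurately
}
*/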

/* For reference: an accurate recurrence, though still less accurate than the custom range reduction above.
    alpha *= -mres;        // negative sign: rotate frame by angle -alpha
    const double c0 = cos(alpha);
    const double s0 = sin(alpha);
    double t0 = 0.0;
    if (mmax > 1) t0 = tan(0.5*alpha);
    cplx eima = c0 + I*s0;
    for (int im=1; im<=mmax; im++) {
        const long lm0 = LiM(shtns,0,im);
        for (long l=im*mres; l<=lmax; ++l)    Rlm[lm0+l] = Qlm[lm0+l] * eima;

        // https://stackoverflow.com/questions/51735576/fast-and-accurate-iterative-generation-of-sine-and-cosine-for-equally-spaced-ang
        eima -= s0*(t0*eima - I*eima);
    }
*/

static void SH_rotK90_init(shtns_cfg shtns)
{
    cplx *q;
    double *q0;
    int nfft, nrembed, ncembed;

//    if ((shtns->mres != 1) || (shtns->mmax != shtns->lmax)) runerr("Arbitrary rotations require lmax=mmax and mres=1");

#define NWAY 4

    const int lmax = shtns->lmax;
    const int fac = 2*VSIZE2*NWAY;        // we need a multiple of 2*VSIZE2*NWAY ...
    const int ntheta = fft_int( ((lmax+fac)/fac) , 7) * fac;        // ... and also an fft-friendly value

    // generate the equispaced grid for synthesis
    shtns->ct_rot = malloc( sizeof(double)*ntheta );
    shtns->st_rot = shtns->ct_rot + (ntheta/2);
    for (int k=0; k<ntheta/2; ++k) {
        double cost = cos(((0.5*M_PI)*(2*k+1))/ntheta);
        double sint = sqrt((1.0-cost)*(1.0+cost));
        shtns->ct_rot[k] = cost;
        shtns->st_rot[k] = sint;
    }

    // plan FFT
    size_t sze = sizeof(double)*(2*ntheta+2)*lmax;
    q0 = VMALLOC(sze);        // alloc.
  #ifdef OMP_FFTW
    int k = (lmax < 63) ? 1 : shtns->nthreads;
    fftw_plan_with_nthreads(k);
  #endif
    q = (cplx*) q0;        // in-place FFT
    nfft = 2*ntheta;    nrembed = nfft+2;    ncembed = nrembed/2;
    shtns->fft_rot = fftw_plan_many_dft_r2c(1, &nfft, lmax, q0, &nrembed, lmax, 1, q, &ncembed, lmax, 1, FFTW_ESTIMATE);    // FFTW_ESTIMATE is suboptimal, but these routines are not used anymore...

    VFREE(q0);
    shtns->npts_rot = ntheta;        // save ntheta, and mark as initialized.
}

static void SH_rotK90(shtns_cfg shtns, cplx *Qlm, cplx *Rlm, double dphi0, double dphi1)
{
//    if (shtns->npts_rot == 0) SH_rotK90_init(shtns);

//    ticks tik0, tik1, tik2, tik3;

    const int lmax = shtns->lmax;
    const int ntheta = shtns->npts_rot;
    size_t sze = sizeof(double)*(2*ntheta+2)*lmax;
    double* const q0 = VMALLOC(sze);        // alloc.

    // rotate around Z by dphi0, and also pre-multiply imaginary parts by m
    if (Rlm != Qlm) {    // copy m=0, which does not change.
        long l=0;    do { Rlm[l] = Qlm[l]; } while(++l <= lmax);
    }
    for (int m=1; m<=lmax; m++) {
        cplx eima = cos(m*dphi0) - I*sin(m*dphi0);    // rotate reference frame by angle -dphi0
        long lm = LiM(shtns,m,m);
        double em = m;
        for (long l=m; l<=lmax; ++l) {
            cplx qrot = Qlm[lm] * eima;
            ((double*)Rlm)[2*lm]   = creal(qrot);
            ((double*)Rlm)[2*lm+1] = cimag(qrot) * em;    // multiply every imaginary part by m (part of im/sin(theta)*Ylm)
            lm++;
        }
    }
    Qlm = Rlm;

//    tik0 = getticks();

    rnd* const qve = (rnd*) VMALLOC( sizeof(rnd)*NWAY*4*lmax );    // vector buffer
    rnd* const qvo = qve + NWAY*2*lmax;        // for odd m
    double* const ct = shtns->ct_rot;
    double* const st = shtns->st_rot;
    double* const alm = shtns->alm;
    const long nk = ntheta/(2*VSIZE2);        // ntheta is a multiple of (2*VSIZE2)
    long k = 0;
    do {
        rnd cost[NWAY], y0[NWAY], y1[NWAY];
        long l=0;
        // m=0
        double* al = alm;
        for (int j=0; j<NWAY; ++j) {
            y0[j] = vall(al[0]) / vread(st, j+k);    // l=0 (discarded) DIVIDED by sin(theta) [to be consistent with m>0]
            cost[j] = vread(ct, j+k);
        }
        for (int j=0; j<NWAY; ++j) {
            y1[j] = vall(al[1]) * y0[j] * cost[j];
        }
        al += 2;    l+=2;
        while(l<=lmax) {
            for (int j=0; j<NWAY; ++j) {
                qve[ (l-2)*2*NWAY + 2*j]   = y1[j] * vall(creal(Qlm[l-1]));        // l-1
                qve[ (l-2)*2*NWAY + 2*j+1] = vall(0.0);
                qvo[ (l-2)*2*NWAY + 2*j]   = vall(0.0);
                qvo[ (l-2)*2*NWAY + 2*j+1] = vall(0.0);
                y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
            }
            for (int j=0; j<NWAY; ++j) {
                qve[ (l-1)*2*NWAY + 2*j]   = y0[j] * vall(creal(Qlm[l]));        // l
                qve[ (l-1)*2*NWAY + 2*j+1] = vall(0.0);
                qvo[ (l-1)*2*NWAY + 2*j]   = vall(0.0);
                qvo[ (l-1)*2*NWAY + 2*j+1] = vall(0.0);
                y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
            }
            al+=4;    l+=2;
        }
        if (l==lmax+1) {
            for (int j=0; j<NWAY; ++j) {
                qve[ (l-2)*2*NWAY + 2*j]   = y1[j] * vall(creal(Qlm[l-1]));        // l-1
                qve[ (l-2)*2*NWAY + 2*j+1] = vall(0.0);
                qvo[ (l-2)*2*NWAY + 2*j]   = vall(0.0);
                qvo[ (l-2)*2*NWAY + 2*j+1] = vall(0.0);
            }
        }
        // m > 0
        for (long m=1; m<=lmax; ++m) {
            rnd* qv = qve;
            if (m&1) qv = qvo;        // store even and odd m separately.
            double* al = shtns->alm + m*(2*(lmax+1) -m+1);
            cplx* Ql = &Qlm[LiM(shtns, 0,m)];    // virtual pointer for l=0 and m
            rnd cost[NWAY], y0[NWAY], y1[NWAY];
            for (int j=0; j<NWAY; ++j) {
                cost[j] = vread(st, k+j);
                y0[j] = vall(2.0);        // *2 for m>0
            }
            long l=m-1;
            long int ny = 0;
            if ((int)lmax <= SHT_L_RESCALE_FLY) {
                do {        // sin(theta)^(m-1)
                    if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
                    for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
                } while(l >>= 1);
            } else {
                long int nsint = 0;
                do {        // sin(theta)^(m-1)   (use rescaling to avoid underflow)
                    if (l&1) {
                        for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
                        ny += nsint;
                        if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
                            ny--;
                            for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
                        }
                    }
                    for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
                    nsint += nsint;
                    if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
                        nsint--;
                        for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
                    }
                } while(l >>= 1);
            }
            for (int j=0; j<NWAY; ++j) {
                y0[j] *= vall(al[0]);
                cost[j] = vread(ct, j+k);
            }
            for (int j=0; j<NWAY; ++j) {
                y1[j] = (vall(al[1])*y0[j]) *cost[j];
            }
            l=m;    al+=2;
            while ((ny<0) && (l<lmax)) {        // ylm treated as zero and ignored if ny < 0
                for (int j=0; j<NWAY; ++j) {
                    y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
                }
                for (int j=0; j<NWAY; ++j) {
                    y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
                }
                l+=2;    al+=4;
                if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) {        // rescale when value is significant
                    ++ny;
                    for (int j=0; j<NWAY; ++j) {
                        y0[j] *= vall(1.0/SHT_SCALE_FACTOR);    y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
                    }
                }
            }
            if (ny == 0) {
                while (l<lmax) {
                    rnd qr = vall(creal(Ql[l]));    rnd qi = vall(cimag(Ql[l]));
                    for (int j=0; j<NWAY; ++j) {
                        qv[ (l-1)*2*NWAY + 2*j]   += y0[j] * qr;        // l
                        qv[ (l-1)*2*NWAY + 2*j+1] += y0[j] * qi;
                    }
                    qr = vall(creal(Ql[l+1]));    qi = vall(cimag(Ql[l+1]));
                    for (int j=0; j<NWAY; ++j) {
                        qv[ (l)*2*NWAY + 2*j]   += y1[j] * qr;        // l+1
                        qv[ (l)*2*NWAY + 2*j+1] += y1[j] * qi;
                    }
                    for (int j=0; j<NWAY; ++j) {
                        y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];    // l+2
                    }
                    for (int j=0; j<NWAY; ++j) {
                        y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];    // l+3
                    }
                    l+=2;    al+=4;
                }
                if (l==lmax) {
                    rnd qr = vall(creal(Ql[l]));    rnd qi = vall(cimag(Ql[l]));
                    for (int j=0; j<NWAY; ++j) {
                        qv[ (l-1)*2*NWAY + 2*j]   += y0[j] * qr;        // l
                        qv[ (l-1)*2*NWAY + 2*j+1] += y0[j] * qi;
                    }
                }
            }
        }
        // construct ring using symmetries + transpose...
        double signl = -1.0;
        double* qse = (double*) qve;
        double* qso = (double*) qvo;
        for (long l=1; l<=lmax; ++l) {
            for (int j=0; j<NWAY; j++) {
                for (int i=0; i<VSIZE2; i++) {
                    double qre = qse[(l-1)*2*NWAY*VSIZE2 + 2*j*VSIZE2 + i];        // m even
                    double qie = qse[(l-1)*2*NWAY*VSIZE2 + (2*j+1)*VSIZE2 + i];
                    double qro = qso[(l-1)*2*NWAY*VSIZE2 + 2*j*VSIZE2 + i];        // m odd
                    double qio = qso[(l-1)*2*NWAY*VSIZE2 + (2*j+1)*VSIZE2 + i];
                    long ijk = (k+j)*VSIZE2 + i;
                    qre *= st[ijk];        qro *= st[ijk];        // multiply real part by sin(theta) [to get Ylm from Ylm/sin(theta)]
                    // because qr and qi map to different parities with respect to the upcoming Fourier transform, we can add them!
                    // note that this may result in leakage between even and odd m's if their amplitudes differ widely.
                    q0[ijk*lmax +(l-1)] = (qre + qro) - (qie + qio);
                    q0[(ntheta+ijk)*lmax +(l-1)] = ((qre + qro) + (qie + qio)) * signl;        // * (-1)^l
                    q0[(2*ntheta-1-ijk)*lmax +(l-1)] = (qre - qro) + (qie - qio);
                    q0[(ntheta-1-ijk)*lmax +(l-1)] = ((qre - qro) - (qie - qio)) * signl;    // (-1)^(l-m)
                }
            }
            signl *= -1.0;
        }
        k += NWAY;
    } while (k<nk);
    VFREE(qve);
#undef NWAY

//    tik1 = getticks();

    // perform FFT
    cplx* q = (cplx*) q0;        // in-place FFT
    fftw_execute_dft_r2c(shtns->fft_rot, q0, q);

//    tik2 = getticks();

    const int nphi = 2*ntheta;
    double ydyl[lmax+1];
    long m=0;        long lm=1;        // start at l=1, m=0
    long l;
    legendre_sphPlm_deriv_array_equ(shtns, lmax, m, ydyl+m);
    for (l=1; l<lmax; l+=2) {
        Rlm[lm]   = -creal(q[m*lmax +(l-1)])/(ydyl[l]*nphi);
        Rlm[lm+1] =  creal(q[m*lmax +l])/(ydyl[l+1]*nphi);
        lm+=2;
    }
    if (l==lmax) {
        Rlm[lm] = -creal(q[m*lmax +(l-1)])/(ydyl[l]*nphi);
        lm+=1;
    }
    dphi1 += M_PI/nphi;        // shift rotation angle by the angle of the first synthesis latitude.
    for (m=1; m<=lmax; ++m) {
        legendre_sphPlm_deriv_array_equ(shtns, lmax, m, ydyl+m);
        cplx eimdp = (cos(m*dphi1) - I*sin(m*dphi1))/nphi;
        for (l=m; l<lmax; l+=2) {
            Rlm[lm]   = eimdp*q[m*lmax +(l-1)]*(1./ydyl[l]);
            Rlm[lm+1] = eimdp*q[m*lmax +l]*(-1./ydyl[l+1]);
            lm+=2;
        }
        if (l==lmax) {
            Rlm[lm] = eimdp*q[m*lmax +(l-1)]*(1./ydyl[l]);
            lm++;
        }
    }
    VFREE(q0);

//    tik3 = getticks();
//    printf("    tick ratio : %.3f  %.3f  %.3f\n", elapsed(tik1,tik0)/elapsed(tik3,tik0), elapsed(tik2,tik1)/elapsed(tik3,tik0), elapsed(tik3,tik2)/elapsed(tik3,tik0));

}


void SH_Xrotate90(shtns_cfg shtns, cplx *Qlm, cplx *Rlm)
{
    int lmax= shtns->lmax;
    if ((shtns->mres != 1) || (shtns->mmax < lmax)) shtns_runerr("truncation makes the rotation not closed.");

    if (lmax == 1) {
        Rlm[0] = Qlm[0];    // l=0 is invariant.
        int l=1;            // rotation matrix for rotX(90), l=1 : m=[0, 1r, 1i]
        double q0 = creal(Qlm[LiM(shtns, l, 0)]);
        Rlm[LiM(shtns, l, 0)] = sqrt(2.0) * cimag(Qlm[LiM(shtns, l, 1)]);           //[m=0]   0           0  sqrt(2)
        Rlm[LiM(shtns, l ,1)] = creal(Qlm[LiM(shtns, l, 1)]) - I*(sqrt(0.5)*q0);    //[m=1r]  0           1  0
        return;                                                                     //[m=1i]  -sqrt(2)/2  0  0
    }

    SH_rotK90(shtns, Qlm, Rlm, 0.0, -M_PI/2);
}

void SH_Yrotate90(shtns_cfg shtns, cplx *Qlm, cplx *Rlm)
{
    int lmax= shtns->lmax;
    if ((shtns->mres != 1) || (shtns->mmax < lmax)) shtns_runerr("truncation makes the rotation not closed.");

    if (lmax == 1) {
        Rlm[0] = Qlm[0];    // l=0 is invariant.
        int l=1;            // rotation matrix for rotY(90), l=1 : m=[0, 1r, 1i]
        double q0 = creal(Qlm[LiM(shtns, l, 0)]);                                   //[m=0]   0           sqrt(2)  0
        Rlm[LiM(shtns, l, 0)] = sqrt(2.0) * creal(Qlm[LiM(shtns, l, 1)]);           //[m=1r]  -sqrt(2)/2  0        0
        Rlm[LiM(shtns, l ,1)] = I*cimag(Qlm[LiM(shtns, l, 1)]) - sqrt(0.5) * q0;    //[m=1i]  0           0        1
        return;
    }

    SH_rotK90(shtns, Qlm, Rlm, -M_PI/2, 0.0);
}

void SH_Yrotate(shtns_cfg shtns, cplx *Qlm, double alpha, cplx *Rlm)
{
    if ((shtns->mres != 1) || (shtns->mmax < shtns->lmax)) shtns_runerr("truncation makes the rotation not closed.");

    SH_rotK90(shtns, Qlm, Rlm, 0.0, M_PI/2 + alpha);    // Zrotate(pi/2) + Yrotate90 + Zrotate(pi+alpha)
    SH_rotK90(shtns, Rlm, Rlm, 0.0, M_PI/2);            // Yrotate90 + Zrotate(pi/2)
}
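
/* Editor's usage sketch (not part of the library): a general rotation by Euler
   angles in ZYZ convention can be composed from the routines above, e.g.

    SH_Zrotate(shtns, Qlm, gamma, Rlm);     // first rotation around Z
    SH_Yrotate(shtns, Rlm, beta, Rlm);      // then around Y (in-place is allowed)
    SH_Zrotate(shtns, Rlm, alpha, Rlm);     // final rotation around Z

   The order and signs depend on the chosen convention; check against your own
   definition of the angles. For many rotations, the shtns_rotation_* API defined
   further below is both faster and more accurate.
*/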


static void mul_ct_matrix_shifted(shtns_cfg shtns, double* mx)
{
    long int im,l,lm;
    double a_1;

    if (SHT_NORM == sht_schmidt) {
        lm=0;
        for (im=0; im<=MMAX; im++) {
            double* al = alm_im(shtns,im);
            long int m=im*MRES;
            a_1 = 1.0 / al[1];
            l=m;
            while(++l <= LMAX) {
                al+=2;
                mx[2*lm+1] = a_1;
                a_1 = 1.0 / al[1];
                mx[2*lm] = -a_1*al[0];        // = -al[2*(lm+1)] / al[2*(lm+1)+1];
                lm++;
            }
            if (l == LMAX+1) {    // the last one needs to be computed (used in the vector to scalar transform)
                mx[2*lm+1] = a_1;
                mx[2*lm] = sqrt((l+m)*(l-m))/(2*l+1);        // LMAX+1
                lm++;
            }
        }
    } else {
        lm=0;
        for (im=0; im<=MMAX; im++) {
            double* al = alm_im(shtns, im);
            l=im*MRES;
            while(++l <= LMAX+1) {    // compute coeff up to LMAX+1; it fits into the 2*NLM block, and is needed for vector <-> scalar conversions.
                a_1 = 1.0 / al[1];
                mx[2*lm] = a_1;        // specific to orthonormal.
                mx[2*lm+1] = a_1;
                lm++;    al+=2;
            }
        }
    }
}

static void st_dt_matrix_shifted(shtns_cfg shtns, double* mx)
{
    mul_ct_matrix_shifted(shtns, mx);
    for (int lm=0; lm<NLM; lm++) {
        mx[2*lm] *= -(shtns->li[lm] + 2);    // coeff (l+1)
        mx[2*lm+1] *= shtns->li[lm];         // coeff (l-1)
    }
}


void mul_ct_matrix(shtns_cfg shtns, double* mx)
{
    mul_ct_matrix_shifted(shtns, mx);
    for (int lm=2*NLM-1; lm>0; lm--)    mx[lm] = mx[lm-1];    // shift back indices (copy, slow)
    mx[0] = 0.0;
    for (int im=1; im<=MMAX; im++) {        // remove the coeff for lmax+1 (for backward compatibility)
        int lm = LiM(shtns, im*MRES, im);
        mx[2*lm-1] = 0.0;        mx[2*lm] = 0.0;
    }
    mx[2*NLM-1] = 0.0;
}

void st_dt_matrix(shtns_cfg shtns, double* mx)
{
    mul_ct_matrix(shtns, mx);
    for (int lm=0; lm<NLM; lm++) {
        mx[2*lm] *= shtns->li[lm] - 1;        // coeff (l-1)
        mx[2*lm+1] *= -(shtns->li[lm] + 2);   // coeff (l+1)
    }
}

void SH_mul_mx(shtns_cfg shtns, double* mx, cplx *Qlm, cplx *Rlm)
{
    long int nlmlim, lm;
    v2d* vq = (v2d*) Qlm;
    v2d* vr = (v2d*) Rlm;
    nlmlim = NLM-1;
    lm = 0;
        s2d mxu = vdup(mx[1]);
        vr[0] = mxu*vq[1];
    for (lm=1; lm<nlmlim; lm++) {
        s2d mxl = vdup(mx[2*lm]);        s2d mxu = vdup(mx[2*lm+1]);
        vr[lm] = mxl*vq[lm-1] + mxu*vq[lm+1];
    }
    lm = nlmlim;
        s2d mxl = vdup(mx[2*lm]);
        vr[lm] = mxl*vq[lm-1];
}
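
/* Editor's usage sketch (assumptions: Qlm holds NLM coefficients, shtns is an
   initialized config): multiply a field by cos(theta) entirely in spectral space,
   using the coupling matrix filled by mul_ct_matrix() above.

    double* mx = (double*) malloc(sizeof(double) * 2*NLM);
    cplx* CQlm = (cplx*) malloc(sizeof(cplx) * NLM);
    mul_ct_matrix(shtns, mx);           // coefficients coupling (l-1,m) and (l+1,m)
    SH_mul_mx(shtns, mx, Qlm, CQlm);    // CQlm = [cos(theta)] * Qlm
    free(CQlm);    free(mx);
*/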


// truncation at LMAX and MMAX
#define LTR LMAX
#define MTR MMAX

void SH_to_point_cplx(shtns_cfg shtns, cplx *alm, double cost, double phi, cplx* z_out)
{
    double yl[LMAX+1];
    long int l,m;
    cplx z = 0.0;

    v2d vr0 = vdup(0.0);    v2d vr1 = vdup(0.0);
    m=0;
    legendre_sphPlm_array(shtns, LTR, m, cost, &yl[m]);

    int ll = 0;
    for (l=0; l<LTR; l+=2) {
        ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
        vr0 += ((v2d*)alm)[ll] * vdup(yl[l]);
        ll += (l<MMAX) ? 2*l+2 : 2*MMAX+1;
        vr1 += ((v2d*)alm)[ll] * vdup(yl[l+1]);
    }
    if (l==LTR) {
        ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
        vr0 += ((v2d*)alm)[ll] * vdup(yl[l]);
    }
    vr0 += vr1;
    z = vcplx_real(vr0) + I*vcplx_imag(vr0);
    if (MTR>0) {
        cplx eip = cos(MRES*phi) + I*sin(MRES*phi);
        v2d vrc = vdup(0.0);
        v2d vrs = vdup(0.0);
        cplx eimp = eip;
        for (long im=1; im<=MTR; im++) {
            const long m = im*MRES;
            //memset(yl+m, 0xFF, sizeof(double)*(LMAX+1-m));
            long lnz = legendre_sphPlm_array(shtns, LTR, m, cost, &yl[m]);
            //if (lnz > m) printf("m=%d, lnz=%d [ %g, %g, %g]\n", m, lnz, yl[lnz-1],yl[lnz],yl[lnz+1]);
            if (lnz > LTR) break;        // nothing else to do

            v2d vrm = vdup(0.0);    v2d vim = vdup(0.0);
            long ll = m*m;
            long l=m;
            cplx* almm = alm - 2*m;        // m<0

            //for (; l<lnz; l++)    ll += (l<=MMAX) ? 2*l : 2*MMAX+1;    // skip zeros in yl, replaced by direct computation in the next block:
            if (lnz > m) {    // skip zeros in yl:
                int lx = (lnz <= MMAX) ? lnz : MMAX+1;
                ll += ((m+lx-1)*(lx-m));        // for (l=m; l<min(lnz,MMAX+1); l++)  ll += 2*l;
                if (lnz > MMAX+1) {
                    ll += (2*MMAX+1)*(lnz-(MMAX+1));    // for (l=MMAX+1; l<lnz; l++)  ll += 2*MMAX+1;
                }
                l=lnz;
            }
            for (; l<=LMAX; l++) {        // gather from cplx rep
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                vim += vdup(yl[l]) * ((v2d*)almm)[ll];    // -m
                vrm += vdup(yl[l]) * ((v2d*)alm)[ll];     // +m
            }
            //cplx eimp = cos(m*phi) + I*sin(m*phi);    // we need something accurate here.
            if (m&1) vim = -vim;        // m<0, m odd
            //vrc += vdup(cos(m*phi)) * (vrm+vim);      // error @lmax=1023 = 2e-7
            //vrs += vdup(sin(m*phi)) * (vrm-vim);
            vrc += vdup(creal(eimp)) * (vrm+vim);       // error @lmax=1023 = 2e-9
            vrs += vdup(cimag(eimp)) * (vrm-vim);
            eimp *= eip;
        }
        z += vcplx_real(vrc) - vcplx_imag(vrs) + I*(vcplx_imag(vrc) + vcplx_real(vrs));
    }
    *z_out = z;
}


double SH_to_point(shtns_cfg shtns, cplx *Qlm, double cost, double phi)
{
    double yl[LMAX+1];
    double vr0, vr1;
    long int l,m,im;

    vr0 = 0.0;    vr1 = 0.0;
    m=0;    im=0;
    legendre_sphPlm_array(shtns, LTR, im, cost, &yl[m]);
    for (l=m; l<LTR; l+=2) {
        vr0 += yl[l]   * creal( Qlm[l] );
        vr1 += yl[l+1] * creal( Qlm[l+1] );
    }
    if (l==LTR) {
        vr0 += yl[l] * creal( Qlm[l] );
    }
    vr0 += vr1;

    for (im=1; im<=MTR; im++) {
        m = im*MRES;
        long lnz = legendre_sphPlm_array(shtns, LTR, im, cost, &yl[m]);
        if (lnz > LTR) break;        // nothing else to do

        v2d* Ql = (v2d*) &Qlm[LiM(shtns, 0,im)];    // virtual pointer for l=0 and im
        v2d vrm0 = vdup(0.0);    v2d vrm1 = vdup(0.0);
        for (l=lnz; l<LTR; l+=2) {
            vrm0 += vdup(yl[l])   * Ql[l];
            vrm1 += vdup(yl[l+1]) * Ql[l+1];
        }
        cplx eimp = 2.*(cos(m*phi) + I*sin(m*phi));    // we need something accurate here.
        vrm0 += vrm1;
        if (l==LTR) {
            vrm0 += vdup(yl[l]) * Ql[l];
        }
        vr0 += vcplx_real(vrm0)*creal(eimp) - vcplx_imag(vrm0)*cimag(eimp);
    }
    return vr0;
}
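
/* Editor's usage sketch: point-wise evaluation, without any spatial grid.
   For a scalar field:

    double v = SH_to_point(shtns, Qlm, cos(theta), phi);

   and for a vector field given by its radial/spheroidal/toroidal coefficients
   (SHqst_to_point is defined below):

    double vr, vt, vp;
    SHqst_to_point(shtns, Qlm, Slm, Tlm, cos(theta), phi, &vr, &vt, &vp);
*/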

void SH_to_grad_point(shtns_cfg shtns, cplx *DrSlm, cplx *Slm, double cost, double phi,
                    double *gr, double *gt, double *gp)
{
    double yl[LMAX+1];
    double dtyl[LMAX+1];
    double vtt, vpp, vr0, vrm;
    long int l,m,im;

    const double sint = sqrt((1.-cost)*(1.+cost));
    vtt = 0.;  vpp = 0.;  vr0 = 0.;  vrm = 0.;
    m=0;    im=0;
    legendre_sphPlm_deriv_array(shtns, LTR, im, cost, sint, &yl[m], &dtyl[m]);
    for (l=m; l<=LTR; ++l) {
        vr0 += yl[l] * creal( DrSlm[l] );
        vtt += dtyl[l] * creal( Slm[l] );
    }
    if (MTR>0) {
        im=1;  do {
            m = im*MRES;
            long lnz = legendre_sphPlm_deriv_array(shtns, LTR, im, cost, sint, &yl[m], &dtyl[m]);
            if (lnz > LTR) break;        // nothing else to do

            cplx eimp = 2.*(cos(m*phi) + I*sin(m*phi));
            cplx imeimp = eimp*m*I;
            l = LiM(shtns, 0,im);
            v2d* Ql = (v2d*) &DrSlm[l];    v2d* Sl = (v2d*) &Slm[l];
            v2d qm = vdup(0.0);
            v2d dsdt = vdup(0.0);    v2d dsdp = vdup(0.0);
            for (l=lnz; l<=LTR; ++l) {
                qm   += vdup(yl[l])   * Ql[l];
                dsdt += vdup(dtyl[l]) * Sl[l];
                dsdp += vdup(yl[l])   * Sl[l];
            }
            vrm += vcplx_real(qm)*creal(eimp)     - vcplx_imag(qm)*cimag(eimp);        // dS/dr
            vtt += vcplx_real(dsdt)*creal(eimp)   - vcplx_imag(dsdt)*cimag(eimp);      // dS/dt
            vpp += vcplx_real(dsdp)*creal(imeimp) - vcplx_imag(dsdp)*cimag(imeimp);    // + I.m/sint *S
        } while (++im <= MTR);
        vr0 += vrm*sint;
    }
    *gr = vr0;    // Gr = dS/dr
    *gt = vtt;    // Gt = dS/dt
    *gp = vpp;    // Gp = I.m/sint *S
}

void SHqst_to_point(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double cost, double phi,
                    double *vr, double *vt, double *vp)
{
    double yl[LMAX+1];
    double dtyl[LMAX+1];
    double vtt, vpp, vr0, vrm;
    long int l,m,im;

    const double sint = sqrt((1.-cost)*(1.+cost));
    vtt = 0.;  vpp = 0.;  vr0 = 0.;  vrm = 0.;
    m=0;    im=0;
    legendre_sphPlm_deriv_array(shtns, LTR, im, cost, sint, &yl[m], &dtyl[m]);
    for (l=m; l<=LTR; ++l) {
        vr0 += yl[l] * creal( Qlm[l] );
        vtt += dtyl[l] * creal( Slm[l] );
        vpp -= dtyl[l] * creal( Tlm[l] );
    }
    if (MTR>0) {
        im=1;  do {
            m = im*MRES;
            legendre_sphPlm_deriv_array(shtns, LTR, im, cost, sint, &yl[m], &dtyl[m]);
            cplx eimp = 2.*(cos(m*phi) + I*sin(m*phi));
            cplx imeimp = eimp*m*I;
            l = LiM(shtns, 0,im);
            v2d* Ql = (v2d*) &Qlm[l];    v2d* Sl = (v2d*) &Slm[l];    v2d* Tl = (v2d*) &Tlm[l];
            v2d qm = vdup(0.0);
            v2d dsdt = vdup(0.0);    v2d dtdt = vdup(0.0);
            v2d dsdp = vdup(0.0);    v2d dtdp = vdup(0.0);
            for (l=m; l<=LTR; ++l) {
                qm   += vdup(yl[l])   * Ql[l];
                dsdt += vdup(dtyl[l]) * Sl[l];
                dtdt += vdup(dtyl[l]) * Tl[l];
                dsdp += vdup(yl[l])   * Sl[l];
                dtdp += vdup(yl[l])   * Tl[l];
            }
            vrm += vcplx_real(qm)*creal(eimp) - vcplx_imag(qm)*cimag(eimp);
            vtt += (vcplx_real(dtdp)*creal(imeimp) - vcplx_imag(dtdp)*cimag(imeimp))    // + I.m/sint *T
                 + (vcplx_real(dsdt)*creal(eimp)   - vcplx_imag(dsdt)*cimag(eimp));     // + dS/dt
            vpp += (vcplx_real(dsdp)*creal(imeimp) - vcplx_imag(dsdp)*cimag(imeimp))    // + I.m/sint *S
                 - (vcplx_real(dtdt)*creal(eimp)   - vcplx_imag(dtdt)*cimag(eimp));     // - dT/dt
        } while (++im <= MTR);
        vr0 += vrm*sint;
    }
    *vr = vr0;
    *vt = vtt;    // Bt = I.m/sint *T  + dS/dt
    *vp = vpp;    // Bp = I.m/sint *S  - dT/dt
}

#undef LTR
#undef MTR


/*
    SYNTHESIS AT A GIVEN LATITUDE
    (does not require a previous call to shtns_set_grid)
*/

void SHqst_to_lat(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double cost,
                    double *vr, double *vt, double *vp, int nphi, int ltr, int mtr)
{
    cplx vst, vtt, vsp, vtp, vrr;
    cplx *vrc, *vtc, *vpc;
    double* ylm_lat;
    double* dylm_lat;
    double st_lat;

    if (ltr > LMAX) ltr=LMAX;
    if (mtr > MMAX) mtr=MMAX;
    if (mtr*MRES > ltr) mtr=ltr/MRES;
    if (mtr*2*MRES >= nphi) mtr = (nphi-1)/(2*MRES);

    ylm_lat = shtns->ylm_lat;
    if (ylm_lat == NULL) {        // alloc memory for Legendre functions ?
        ylm_lat = (double *) malloc(sizeof(double)* 2*NLM);
        shtns->ylm_lat = ylm_lat;
    }
    dylm_lat = ylm_lat + NLM;

    st_lat = sqrt((1.-cost)*(1.+cost));    // sin(theta)
    if (cost != shtns->ct_lat) {        // compute Legendre functions ?
        shtns->ct_lat = cost;
        for (int m=0,j=0; m<=mtr; ++m) {
            legendre_sphPlm_deriv_array(shtns, ltr, m, cost, st_lat, &ylm_lat[j], &dylm_lat[j]);
            j += LMAX -m*MRES +1;
        }
    }

    vrc = (cplx*) fftw_malloc(sizeof(double) * 3*(nphi+2));
    vtc = vrc + (nphi/2+1);
    vpc = vtc + (nphi/2+1);

    if (nphi != shtns->nphi_lat) {        // compute FFTW plan ?
        if (shtns->ifft_lat) fftw_destroy_plan(shtns->ifft_lat);
      #ifdef OMP_FFTW
        fftw_plan_with_nthreads(1);
      #endif
        shtns->ifft_lat = fftw_plan_dft_c2r_1d(nphi, vrc, vr, FFTW_ESTIMATE);
        shtns->nphi_lat = nphi;
    }

    for (int m = 0; m<nphi/2+1; ++m) {    // init with zeros
        vrc[m] = 0.0;    vtc[m] = 0.0;    vpc[m] = 0.0;
    }
    long j=0;
    int m=0;
        vrr=0;    vtt=0;    vst=0;
        for(int l=m; l<=ltr; ++l, ++j) {
            vrr += ylm_lat[j] * creal(Qlm[j]);
            vst += dylm_lat[j] * creal(Slm[j]);
            vtt += dylm_lat[j] * creal(Tlm[j]);
        }
        j += (LMAX-ltr);
        vrc[m] = vrr;
        vtc[m] = vst;     // Vt =  dS/dt
        vpc[m] = -vtt;    // Vp = - dT/dt
    for (int m=MRES; m<=mtr*MRES; m+=MRES) {
        vrr=0;    vtt=0;    vst=0;    vsp=0;    vtp=0;
        for(int l=m; l<=ltr; ++l, ++j) {
            vrr += ylm_lat[j] * Qlm[j];
            vst += dylm_lat[j] * Slm[j];
            vtt += dylm_lat[j] * Tlm[j];
            vsp += ylm_lat[j] * Slm[j];
            vtp += ylm_lat[j] * Tlm[j];
        }
        j+=(LMAX-ltr);
        vrc[m] = vrr*st_lat;
        vtc[m] = I*m*vtp + vst;    // Vt = I.m/sint *T  + dS/dt
        vpc[m] = I*m*vsp - vtt;    // Vp = I.m/sint *S  - dT/dt
    }
    fftw_execute_dft_c2r(shtns->ifft_lat,vrc,vr);
    fftw_execute_dft_c2r(shtns->ifft_lat,vtc,vt);
    fftw_execute_dft_c2r(shtns->ifft_lat,vpc,vp);
    fftw_free(vrc);
}
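
/* Editor's usage sketch (nphi=64 is an arbitrary choice for the example):
   synthesize all three vector components on nphi equispaced points of the
   latitude circle defined by cos(theta)=cost:

    double vr[64], vt[64], vp[64];
    SHqst_to_lat(shtns, Qlm, Slm, Tlm, cost, vr, vt, vp, 64, LMAX, MMAX);
*/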

void SH_to_lat(shtns_cfg shtns, cplx *Qlm, double cost,
                    double *vr, int nphi, int ltr, int mtr)
{
    cplx vrr;
    cplx *vrc;
    double* ylm_lat;
    double* dylm_lat;
    double st_lat;

    if (ltr > LMAX) ltr=LMAX;
    if (mtr > MMAX) mtr=MMAX;
    if (mtr*MRES > ltr) mtr=ltr/MRES;
    if (mtr*2*MRES >= nphi) mtr = (nphi-1)/(2*MRES);

    ylm_lat = shtns->ylm_lat;
    if (ylm_lat == NULL) {
        ylm_lat = (double *) malloc(sizeof(double)* 2*NLM);
        shtns->ylm_lat = ylm_lat;
    }
    dylm_lat = ylm_lat + NLM;

    st_lat = sqrt((1.-cost)*(1.+cost));    // sin(theta)
    if (cost != shtns->ct_lat) {
        shtns->ct_lat = cost;
        for (int m=0,j=0; m<=mtr; ++m) {
            legendre_sphPlm_deriv_array(shtns, ltr, m, cost, st_lat, &ylm_lat[j], &dylm_lat[j]);
            j += LMAX -m*MRES +1;
        }
    }

    vrc = (cplx*) fftw_malloc(sizeof(double) * (nphi+2));

    if (nphi != shtns->nphi_lat) {
        if (shtns->ifft_lat) fftw_destroy_plan(shtns->ifft_lat);
      #ifdef OMP_FFTW
        fftw_plan_with_nthreads(1);
      #endif
        shtns->ifft_lat = fftw_plan_dft_c2r_1d(nphi, vrc, vr, FFTW_ESTIMATE);
        shtns->nphi_lat = nphi;
    }

    for (int m = 0; m<nphi/2+1; ++m) {    // init with zeros
        vrc[m] = 0.0;
    }
    long j=0;
    int m=0;
        vrr=0;
        for(int l=m; l<=ltr; ++l, ++j) {
            vrr += ylm_lat[j] * creal(Qlm[j]);
        }
        j += (LMAX-ltr);
        vrc[m] = vrr;
    for (int m=MRES; m<=mtr*MRES; m+=MRES) {
        vrr=0;
        for(int l=m; l<=ltr; ++l, ++j) {
            vrr += ylm_lat[j] * Qlm[j];
        }
        j+=(LMAX-ltr);
        vrc[m] = vrr*st_lat;
    }
    fftw_execute_dft_c2r(shtns->ifft_lat,vrc,vr);
    fftw_free(vrc);
}



// SPAT_CPLX transform indexing scheme:
//   if (l<=MMAX) : index = l*(l+1) + m
//   if (l>=MMAX) : index = l*(2*mmax+1) - mmax*mmax + m  =  mmax*(2*l-mmax) + l+m
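/* The scheme above, written out (editor's sketch): index of the (l,m)
   coefficient, with -l <= m <= l, in the packed layout used by the *_cplx
   transforms below.

static long lm_cplx_index(long l, long m, long mmax)
{
    return (l <= mmax) ? l*(l+1) + m : mmax*(2*l - mmax) + l + m;
}
*/
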
void SH_2real_to_cplx(shtns_cfg shtns, cplx* Rlm, cplx* Ilm, cplx* Zlm)
{
    // combine into complex coefficients
    unsigned ll = 0;
    unsigned lm = 0;
    for (unsigned l=0; l<=LMAX; l++) {
        ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
        Zlm[ll] = creal(Rlm[lm]) + I*creal(Ilm[lm]);    // m=0
        lm++;
    }
    for (unsigned m=1; m<=MMAX; m++) {
        ll = (m-1)*m;
        for (unsigned l=m; l<=LMAX; l++) {
            ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
            cplx rr = Rlm[lm];
            cplx ii = Ilm[lm];
            Zlm[ll+m] = rr + I*ii;            // m>0
            rr = conj(rr) + I*conj(ii);       // m<0, m even
            if (m&1) rr = -rr;                // m<0, m odd
            Zlm[ll-m] = rr;
            lm++;
        }
    }
}

void SH_cplx_to_2real(shtns_cfg shtns, cplx* Zlm, cplx* Rlm, cplx* Ilm)
{
    // extract the complex coefficients corresponding to the real and imaginary parts
    unsigned ll = 0;
    unsigned lm = 0;
    for (unsigned l=0; l<=LMAX; l++) {
        ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
        Rlm[lm] = creal(Zlm[ll]);        // m=0
        Ilm[lm] = cimag(Zlm[ll]);
        lm++;
    }
    double half_parity = 0.5;
    for (unsigned m=1; m<=MMAX; m++) {
        ll = (m-1)*m;
        half_parity = -half_parity;      // (-1)^m * 0.5
        for (unsigned l=m; l<=LMAX; l++) {
            ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
            cplx b = Zlm[ll-m] * half_parity;    // (-1)^m for m negative.
            cplx a = Zlm[ll+m] * 0.5;
            Rlm[lm] = (conj(b) + a);         // real part
            Ilm[lm] = (conj(b) - a)*I;       // imag part
            lm++;
        }
    }
}


void spat_cplx_to_SH(shtns_cfg shtns, cplx *z, cplx *alm)
{
    const long int nspat = shtns->nspat;
    cplx *rlm, *ilm, *Q, *mem;

    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    mem = (cplx*) VMALLOC( (nspat+2*NLM)*sizeof(cplx) );
    rlm = mem + nspat;
    ilm = rlm + NLM;

    Q = z;
    if (NPHI>1) {
        if (shtns->fft_mode & FFT_OOP) Q = mem;    // out-of-place transform
        fftw_execute_dft(shtns->fft_cplx, z, Q);
    }

    const double norm = 1.0/NPHI;
    #pragma omp parallel for schedule(static,1) num_threads(shtns->nthreads)
    for (int m=0; m<=MMAX; m++) {
        if (m==0) {        // m=0
            spat_to_SH_ml(shtns, 0, Q, rlm, LMAX);        // real
            spat_to_SH_ml(shtns, 0, (cplx*)(((double*)Q)+1), ilm, LMAX);    // imag
            int lm = 0;
            int ll = 0;
            for (int l=0; l<=LMAX; l++) {
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                alm[ll] = (creal(rlm[lm]) + I*creal(ilm[lm]))*norm;    // m=0
                lm++;
            }
        } else {
            long lm = LM(shtns,m,m);
            spat_to_SH_ml(shtns, m, Q + (NPHI-m)*NLAT, rlm + lm, LMAX);    // m>0
            spat_to_SH_ml(shtns, m, Q + m*NLAT,        ilm + lm, LMAX);    // m<0
            int ll = (m-1)*m;
            for (int l=m; l<=LMAX; l++) {
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                cplx rr = rlm[lm];        // +m
                cplx ii = ilm[lm];        // -m
                alm[ll+m] = rr*norm;
                if (m&1) ii = -ii;        // m<0, m odd
                alm[ll-m] = ii*norm;
                lm++;
            }
        }
    }

    VFREE(mem);
}

void SH_to_spat_cplx(shtns_cfg shtns, cplx *alm, cplx *z)
{
    const long int nspat = shtns->nspat;
    cplx *rlm, *ilm, *Q, *mem;

    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    mem = (cplx*) VMALLOC( 2*(nspat + NLM*2)*sizeof(double) );
    rlm = mem + nspat;
    ilm = rlm + NLM;

    Q = z;
    if ((NPHI>1) && (shtns->fft_mode & FFT_OOP)) Q = mem;    // out-of-place transform

    #pragma omp parallel for schedule(static,1) num_threads(shtns->nthreads)
    for (int m=0; m<=MMAX; m++) {
        if (m==0) {        // m=0
            int lm = 0;
            int ll = 0;
            for (int l=0; l<=LMAX; l++) {
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                rlm[lm] = creal(alm[ll]);
                ilm[lm] = cimag(alm[ll]);
                lm++;
            }
            cplx tmp[NLAT] SSE;
            SH_to_spat_ml(shtns, 0, rlm, Q, LMAX);
            SH_to_spat_ml(shtns, 0, ilm, tmp, LMAX);
            for (int it=0; it<NLAT; it++) {
                ((double*)Q)[2*it+1] = creal(tmp[it]);    // copy imaginary part to the destination array.
            }
            // fill m>MMAX with zeros!
            for (long i=(MMAX+1)*NLAT; i<(NPHI-MMAX)*NLAT; i++)    Q[i] = 0.0;
        } else {
            long lm = LM(shtns,m,m);
            int ll = m*m;
            cplx* almm = alm - 2*m;
            for (int l=m; l<=LMAX; l++) {        // gather from cplx rep
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                cplx rr = alm[ll];      // +m
                cplx ii = almm[ll];     // -m
                if (m&1) ii = -ii;      // m<0, m odd
                rlm[lm] = rr;
                ilm[lm] = ii;
                lm++;
            }
            lm = LM(shtns,m,m);
            SH_to_spat_ml(shtns, m, rlm + lm, Q + m*NLAT,        LMAX);    // m>0
            SH_to_spat_ml(shtns, m, ilm + lm, Q + (NPHI-m)*NLAT, LMAX);    // m<0
        }
    }

    if (NPHI>1) fftw_execute_dft(shtns->ifft_cplx, Q, z);
    VFREE(mem);
}



void SHsphtor_to_spat_cplx(shtns_cfg shtns, cplx *slm, cplx *tlm, cplx *zt, cplx *zp)
{
    const long int nspat = shtns->nspat;
    cplx *zzt, *zzp, *mem, *stlm;

    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    mem = (cplx*) VMALLOC( 4*(nspat + NLM*2)*sizeof(double) );
    stlm = mem + 2*nspat;

    zzt = zt;    zzp = zp;
    if ((NPHI>1) && (shtns->fft_mode & FFT_OOP)) { zzt = mem;  zzp = mem + nspat; }    // out-of-place transform

    #pragma omp parallel for schedule(static,1) num_threads(shtns->nthreads)
    for (int m=0; m<=MMAX; m++) {
        const long stride = LMAX+1-m;
        if (m==0) {        // m=0
            int lm = 0;
            int ll = 0;
            for (int l=0; l<=LMAX; l++) {        // gather from cplx rep
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                stlm[lm]          = creal(slm[ll]);
                stlm[lm+2*stride] = cimag(slm[ll]);
                stlm[lm+stride]   = creal(tlm[ll]);
                stlm[lm+3*stride] = cimag(tlm[ll]);
                lm++;
            }
            cplx tt[NLAT] SSE;
            cplx pp[NLAT] SSE;
            SHsphtor_to_spat_ml(shtns, 0, stlm, stlm+stride, zzt, zzp, LMAX);
            SHsphtor_to_spat_ml(shtns, 0, stlm+2*stride, stlm+3*stride, tt, pp, LMAX);
            for (int it=0; it<NLAT; it++) {
                ((double*)zzt)[2*it+1] = creal(tt[it]);    // copy imaginary part to the destination array.
                ((double*)zzp)[2*it+1] = creal(pp[it]);
            }
            // fill m>MMAX with zeros!
            for (long i=(MMAX+1)*NLAT; i<(NPHI-MMAX)*NLAT; i++) {
                zzt[i] = 0.0;    zzp[i] = 0.0;
            }
        } else {
            long lm = 4 * LM(shtns,m,m);
            int ll = (m-1)*m;
            for (int l=m; l<=LMAX; l++) {        // gather from cplx rep
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                cplx sr = slm[ll+m];    // +m
                cplx si = slm[ll-m];    // -m
                cplx tr = tlm[ll+m];    // +m
                cplx ti = tlm[ll-m];    // -m
                if (m&1) {        // m<0, m odd
                    si = -si;
                    ti = -ti;
                }
                stlm[lm] = sr;           stlm[lm+2*stride] = si;
                stlm[lm+stride] = tr;    stlm[lm+3*stride] = ti;
                lm++;
            }
            lm = 4 * LM(shtns,m,m);
            SHsphtor_to_spat_ml(shtns, m,  stlm+lm, stlm+stride+lm, zzt + m*NLAT, zzp + m*NLAT, LMAX);    // m>0
            SHsphtor_to_spat_ml(shtns, -m, stlm+2*stride+lm, stlm+3*stride+lm, zzt + (NPHI-m)*NLAT, zzp + (NPHI-m)*NLAT, LMAX);    // m<0
        }
    }

    if (NPHI>1) {
        fftw_execute_dft(shtns->ifft_cplx, zzt, zt);
        fftw_execute_dft(shtns->ifft_cplx, zzp, zp);
    }
    VFREE(mem);
}


void spat_cplx_to_SHsphtor(shtns_cfg shtns, cplx *zt, cplx *zp, cplx *slm, cplx *tlm)
{
    const long int nspat = shtns->nspat;
    cplx *zzt, *zzp, *mem, *stlm;

    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    mem = (cplx*) VMALLOC( (2*nspat + NLM*4)*sizeof(cplx) );
    stlm = mem + 2*nspat;

    zzt = zt;    zzp = zp;
    if (NPHI>1) {
        if (shtns->fft_mode & FFT_OOP) {
            zzt = mem;    zzp = mem + nspat;    // out-of-place transform
        }
        fftw_execute_dft(shtns->fft_cplx, zt, zzt);
        fftw_execute_dft(shtns->fft_cplx, zp, zzp);
    }

    const double norm = 1.0/NPHI;
    #pragma omp parallel for schedule(static,1) num_threads(shtns->nthreads)
    for (int m=0; m<=MMAX; m++) {
        const long stride = LMAX+1-m;
        if (m==0) {        // m=0
            spat_to_SHsphtor_ml(shtns, 0, zzt, zzp, stlm, stlm+stride, LMAX);        // real
            spat_to_SHsphtor_ml(shtns, 0, (cplx*)(((double*)zzt)+1), (cplx*)(((double*)zzp)+1), stlm+2*stride, stlm+3*stride, LMAX);    // imag
            int lm = 0;
            int ll = 0;
            for (int l=0; l<=LMAX; l++) {
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                slm[ll] = (creal(stlm[lm]) + I*creal(stlm[lm+2*stride]))*norm;           // m=0
                tlm[ll] = (creal(stlm[lm+stride]) + I*creal(stlm[lm+3*stride]))*norm;    // m=0
                lm++;
            }
        } else {
            long lm = 4 * LM(shtns,m,m);
            spat_to_SHsphtor_ml(shtns, m,  zzt + (NPHI-m)*NLAT, zzp + (NPHI-m)*NLAT, stlm+lm, stlm+lm+stride, LMAX);    // m>0
            spat_to_SHsphtor_ml(shtns, -m, zzt + m*NLAT, zzp + m*NLAT, stlm+lm+2*stride, stlm+lm+3*stride, LMAX);       // m<0
            int ll = (m-1)*m;
            for (int l=m; l<=LMAX; l++) {
                ll += (l<=MMAX) ? 2*l : 2*MMAX+1;
                cplx sr = stlm[lm];             // +m
                cplx tr = stlm[lm+stride];      // +m
                cplx si = stlm[lm+2*stride];    // -m
                cplx ti = stlm[lm+3*stride];    // -m
                slm[ll+m] = sr*norm;
                tlm[ll+m] = tr*norm;
                if (m&1) { si = -si;  ti = -ti; }    // m<0, m odd
                slm[ll-m] = si*norm;
                tlm[ll-m] = ti*norm;
                lm++;
            }
        }
    }

    VFREE(mem);
}

void spat_cplx_to_SHqst(shtns_cfg shtns, cplx *zr, cplx *zt, cplx *zp, cplx *qlm, cplx *slm, cplx *tlm)
{
    spat_cplx_to_SH(shtns, zr, qlm);
    spat_cplx_to_SHsphtor(shtns, zt,zp, slm,tlm);
}

void SHqst_to_spat_cplx(shtns_cfg shtns, cplx *qlm, cplx *slm, cplx *tlm, cplx *zr, cplx *zt, cplx *zp)
{
    SH_to_spat_cplx(shtns, qlm, zr);
    SHsphtor_to_spat_cplx(shtns, slm,tlm, zt,zp);
}


void SH_cplx_Xrotate90(shtns_cfg shtns, cplx *Qlm, cplx *Rlm)
{
    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    cplx* rlm = (cplx*) VMALLOC( NLM*2*sizeof(cplx) );
    cplx* ilm = rlm + NLM;

    // extract complex coefficients corresponding to real and imag
    SH_cplx_to_2real(shtns, Qlm, rlm, ilm);

    // perform two real rotations:
    SH_Xrotate90(shtns, rlm, rlm);
    SH_Xrotate90(shtns, ilm, ilm);

    // combine back into complex coefficients
    SH_2real_to_cplx(shtns, rlm, ilm, Rlm);

    VFREE(rlm);
}

void SH_cplx_Yrotate90(shtns_cfg shtns, cplx *Qlm, cplx *Rlm)
{
    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    cplx* rlm = (cplx*) VMALLOC( NLM*2*sizeof(cplx) );
    cplx* ilm = rlm + NLM;

    // extract complex coefficients corresponding to real and imag
    SH_cplx_to_2real(shtns, Qlm, rlm, ilm);

    // perform two real rotations:
    SH_Yrotate90(shtns, rlm, rlm);
    SH_Yrotate90(shtns, ilm, ilm);

    // combine back into complex coefficients
    SH_2real_to_cplx(shtns, rlm, ilm, Rlm);

    VFREE(rlm);
}

void SH_cplx_Yrotate(shtns_cfg shtns, cplx *Qlm, double alpha, cplx *Rlm)
{
    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    cplx* rlm = (cplx*) VMALLOC( NLM*2*sizeof(cplx) );
    cplx* ilm = rlm + NLM;

    // extract complex coefficients corresponding to real and imag
    SH_cplx_to_2real(shtns, Qlm, rlm, ilm);

    // perform two real rotations:
    SH_Yrotate(shtns, rlm, alpha, rlm);
    SH_Yrotate(shtns, ilm, alpha, ilm);

    // combine back into complex coefficients
    SH_2real_to_cplx(shtns, rlm, ilm, Rlm);

    VFREE(rlm);
}

void SH_cplx_Zrotate(shtns_cfg shtns, cplx *Qlm, double alpha, cplx *Rlm)
{
    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    // alloc temporary fields
    cplx* rlm = (cplx*) VMALLOC( NLM*2*sizeof(cplx) );
    cplx* ilm = rlm + NLM;

    // extract complex coefficients corresponding to real and imag
    SH_cplx_to_2real(shtns, Qlm, rlm, ilm);

    // perform two real rotations:
    SH_Zrotate(shtns, rlm, alpha, rlm);
    SH_Zrotate(shtns, ilm, alpha, ilm);

    // combine back into complex coefficients
    SH_2real_to_cplx(shtns, rlm, ilm, Rlm);

    VFREE(rlm);
}

/*
void SH_cplx_Zrotate(shtns_cfg shtns, cplx *Qlm, double alpha, cplx *Rlm)
{
    if (MRES != 1) shtns_runerr("complex SH requires mres=1.");

    cplx* eima = (cplx*) VMALLOC( (2*MMAX+1)*sizeof(cplx) );
    eima += MMAX;
    eima[0] = 1.0;
    for (int m=1; m<=MMAX; m++) {        // precompute the complex numbers
        double cma = cos(m*alpha);
        double sma = sin(m*alpha);
        eima[m]  = cma - I*sma;
        eima[-m] = cma + I*sma;
    }

    unsigned ll=0;
    for (unsigned l=0; l<=MMAX; l++) {
        for (int m=-l; m<=l; m++) {
            Rlm[ll] = Qlm[ll] * eima[m];
            ll++;
        }
    }
    for (unsigned l=MMAX+1; l<=LMAX; l++) {
        for (int m=-MMAX; m<=MMAX; m++) {
            Rlm[ll] = Qlm[ll] * eima[m];
            ll++;
        }
    }

    VFREE(eima);
}
*/

/*
void SH_to_spat_grad(shtns_cfg shtns, cplx *alm, double *gt, double *gp)
{
    double *mx;
    cplx *blm, *clm;

    blm = (cplx*) VMALLOC( 3*NLM*sizeof(cplx) );
    clm = blm + NLM;
    mx = (double*)(clm + NLM);

    st_dt_matrix(shtns, mx);
    SH_mul_mx(shtns, mx, alm, blm);
    int lm=0;
    for (int im=0; im<=MMAX; im++) {
        int m = im*MRES;
        for (int l=m; l<=LMAX; l++) {
            clm[lm] = alm[lm] * I*m;
            lm++;
        }
    }
    SH_to_spat(shtns, blm, gt);
    SH_to_spat(shtns, clm, gp);
    for (int ip=0; ip<NPHI; ip++) {
        for (int it=0; it<NLAT; it++) {
            gt[ip*NLAT+it] /= shtns->st[it];
            gp[ip*NLAT+it] /= shtns->st[it];
        }
    }
    VFREE(blm);
}
*/

#ifdef _GCC_VEC_
// vector types with relaxed (8-byte) alignment, for possibly-unaligned loads/stores:
typedef double rndu __attribute__ ((vector_size (VSIZE2*8), aligned (8)));
typedef double v2du __attribute__ ((vector_size (16), aligned (8)));
#else
typedef rnd rndu;
typedef v2d v2du;
#endif

shtns_rot shtns_rotation_create(const int lmax, const int mmax, int norm)
{
    shtns_rot r = (shtns_rot) malloc(sizeof(struct shtns_rot_));
    r->lmax = lmax;
    r->mmax = mmax;
    r->no_cs_phase = (norm & SHT_NO_CS_PHASE) ? -1. : 1.;    // adapt rotations to Condon-Shortley phase
    r->m0_renorm = (norm & SHT_REAL_NORM) ? sqrt(2.) : 1.;   // adapt to real norm
    r->plm_beta = (double*) malloc( sizeof(double) * nlm_calc(lmax+1, lmax+1, 1) );
    r->sht = shtns_create(lmax+1, lmax+1, 1, sht_for_rotations | SHT_NO_CS_PHASE);    // need SH up to lmax+1, with Schmidt semi-normalization.
    r->alpha = 0.0;
    r->beta = 0.0;
    r->gamma = 0.0;
    r->eia = 0;    r->eig = 0;
    r->flag_alpha_gamma = 0;    // mark alpha and gamma as zero
    return r;
}

void shtns_rotation_destroy(shtns_rot r)
{
    if (r) {
        shtns_destroy(r->sht);
        if (r->plm_beta) free(r->plm_beta);
        free(r);
    }
}
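
/* Editor's usage sketch of the rotation API (angle values are placeholders):

    shtns_rot r = shtns_rotation_create(lmax, lmax, sht_orthonormal);
    shtns_rotation_set_angles_ZYZ(r, alpha, beta, gamma);
    shtns_rotation_apply_real(r, Qlm, Rlm);    // rotate coefficients of a real field
    shtns_rotation_destroy(r);
*/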

// return exp(I*phi), exact for special values of phi.
static cplx special_eiphi(const double phi)
{
    cplx eip;
    if (phi == 0.0) {
        eip = 1.0;
    } else if (phi == M_PI) {
        eip = -1.0;
    } else if (phi == M_PI/2) {
        eip = I;
    } else if ((phi == 3*M_PI/2) || (phi == -M_PI/2)) {
        eip = -I;
    } else if (phi == M_PI/4) {
        eip = sqrt(0.5) + I*sqrt(0.5);
    } else if (phi == M_PI/3) {
        eip = 0.5 + I*sqrt(3)*0.5;
    } else {
        eip = cos(phi) + I*sin(phi);
    }
    return eip;
}

void shtns_rotation_set_angles_ZYZ(shtns_rot r, double alpha, double beta, double gamma)
{
    beta *= r->no_cs_phase;    // the Condon-Shortley phase is the same thing as rotating by 180°, or changing the sign of beta.
    if UNLIKELY(fabs(beta) > M_PI) {
        printf("ERROR: angle 'beta' must be between -pi and pi\n");
        exit(1);
    }
    if (beta < 0.0) {    // translate to beta>0, as beta<0 is not supported.
        alpha = (alpha>0) ? alpha-M_PI : alpha+M_PI;    // rotate by 180°
        beta = fabs(beta);
        gamma = (gamma>0) ? gamma-M_PI : gamma+M_PI;    // rotate by 180°
    } else if (beta == 0.0) {
        alpha += gamma;
        gamma = 0.0;
    }

    // step 0 : compute plm(beta)
    const cplx eib = special_eiphi(beta);
    r->cos_beta = creal(eib);
    r->sin_beta = cimag(eib);
    r->eia = special_eiphi(-alpha);
    r->eig = special_eiphi(-gamma);
    r->alpha = alpha;
    r->beta = beta;
    r->gamma = gamma;
    r->flag_alpha_gamma = (alpha != 0) + 2*(gamma != 0);
    if (beta != 0.0) {
        const int lmax = r->lmax + 1;    // need SH up to lmax+1.
        #pragma omp parallel for schedule(dynamic) firstprivate(lmax)
        for (int m=0; m<=lmax; m++) {
            const long ofs = m*(lmax+2) - (m*(m+1))/2;
            legendre_sphPlm_array(r->sht, lmax, m, r->cos_beta, r->plm_beta + ofs);
        }
    }
}

void shtns_rotation_set_angles_ZXZ(shtns_rot r, double alpha, double beta, double gamma)
{
    shtns_rotation_set_angles_ZYZ(r, alpha+M_PI/2, beta, gamma-M_PI/2);
}

void shtns_rotation_set_angle_axis(shtns_rot r, double theta, double Vx, double Vy, double Vz)
{
    if ((Vx==0) && (Vy==0)) {    // rotation about the Z-axis
        if (Vz<0) theta = -theta;
        shtns_rotation_set_angles_ZYZ(r, theta, 0, 0);
    } else {
        // 1) convert to the normalized quaternion representation (c, Vx, Vy, Vz).  See https://en.wikipedia.org/wiki/Versor
        double s = sin(0.5*theta);
        double c = cos(0.5*theta);
        double n = s / sqrt(Vx*Vx + Vy*Vy + Vz*Vz);
        Vx *= n;    Vy *= n;    Vz *= n;

        // 2) convert from quaternion to extrinsic Euler angles:
        double beta = acos( 1.0 - 2.0*(Vx*Vx + Vy*Vy) );    // = acos( c*c + Vz*Vz - (Vx*Vx + Vy*Vy) );
        double Vxz = Vx*Vz;
        double Vyz = Vy*Vz;
        double cVx = c*Vx;
        double cVy = c*Vy;
        double alpha = atan2( Vyz - cVx, cVy + Vxz );    // note: for the ZXZ convention, switch x with y and alpha with gamma.
        double gamma = atan2( Vyz + cVx, cVy - Vxz );
        shtns_rotation_set_angles_ZYZ(r, gamma, beta, alpha);    // extrinsic rotation: swap gamma and alpha.  See https://en.wikipedia.org/wiki/Euler_angles#Conventions_by_intrinsic_rotations
    }
}

int quarter_wigner_d_matrix(shtns_rot r, const int l, double* mx, const int compressed)
{
    if ((l > r->lmax) || (l<0)) {
        printf("ERROR: 0 <= l <= lmax not satisfied.\n");
        return 0;    // error, nothing written into mx.
    }

    const int lmax = r->lmax + 1;
    const double cos_beta = r->cos_beta;
    const double sin_beta = r->sin_beta;
    const double* const plm_beta = r->plm_beta;

    const int lw = (compressed) ? l+1 : 2*l+1;
    // shift mx to index by negative values:
    mx += l*lw + l*(compressed==0);

    mx[0] = plm_beta[l];    // d(m'=0, m=0)
    // step 2+3:  d(m'=0,m)  and  d(m'=1,m)
    double cb_p1 = (1. + cos_beta)*0.5;
    double cb_m1 = (1. - cos_beta)*0.5;
    const double d_1 = 1.0/((l)*(l+1.));
    for (int m=1; m<=l; m++) {
        const long ofs_m   = m*(lmax+2) - (m*(m+1))/2;
        const long ofs_mm1 = (m-1)*(lmax+2) - ((m-1)*m)/2;
        const long ofs_mp1 = (m+1)*(lmax+2) - ((m+1)*(m+2))/2;
        mx[m] = plm_beta[ofs_m + (l-m)];    // m'=0 : copy plm values (eq 32)

        double a  = sqrt( ((l+1+m)*(l+1-m)) * d_1 ) * sin_beta;
        double b1 = sqrt( ((l-m+1)*(l-m+2)) * d_1 ) * cb_p1;
        double b2 = sqrt( ((l+m+1)*(l+m+2)) * d_1 ) * cb_m1;
        mx[lw + m] = b2*plm_beta[ofs_mp1 + (l+1)-(m+1)] + b1*plm_beta[ofs_mm1 + (l+1)-(m-1)] + a*plm_beta[ofs_m + (l+1)-m];    // m'=1 (eq 105)
    }

    double clm[l+1];    // temporary array holding clm
    for (int m=0; m<l; m++)    clm[m] = sqrt( (l-m)*(l+m+1) );    // precompute clm
    clm[l] = 0.0;    // boundary condition handled with this

    // step 5: recursively compute d(m',m),  m'=-1;  m=-m'..l
    {    const int mp=-1;    // m'=-1
        const double c_1 = 1.0/clm[0];
        for (int m=-mp; m<l; m++) {
            double H = mx[lw*(mp+2) + m] + c_1*( -clm[m-1] * mx[lw*(mp+1) + (m-1)] + clm[m] * mx[lw*(mp+1) + (m+1)]);    // eq 108
            mx[lw*mp + m] = H;    // d(m',m)
        }
        const int m=l;
        double H = mx[lw*(mp+2) + m] - c_1*clm[m-1] * mx[lw*(mp+1) + (m-1)];    // eq 108
        mx[lw*mp + m] = H;    // d(m',m)
    }

    // step 4+5 merged: recursively compute d(m',m),  m'=2..l; m=m'..l  AND  m'=-2..-l; m=-m'..l
    for (int mp=2; mp<=l; mp++) {
        const double c_1 = 1.0/clm[mp-1];
        const double cmp2 = clm[mp-2];
        for (int m=mp; m<l; m++) {
            double H  = cmp2 * mx[lw*(mp-2) + m]  + clm[m-1] * mx[lw*(mp-1) + (m-1)]  - clm[m] * mx[lw*(mp-1) + (m+1)];     // eq 106
            double H_ = cmp2 * mx[-lw*(mp-2) + m] - clm[m-1] * mx[-lw*(mp-1) + (m-1)] + clm[m] * mx[-lw*(mp-1) + (m+1)];    // eq 108
            mx[lw*mp + m]  = H * c_1;     // d(m',m)
            mx[-lw*mp + m] = H_ * c_1;    // d(-m',m)
        }
        const int m=l;
        double H  = cmp2 * mx[lw*(mp-2) + m]  + clm[m-1] * mx[lw*(mp-1) + (m-1)];     // eq 106
        double H_ = cmp2 * mx[-lw*(mp-2) + m] - clm[m-1] * mx[-lw*(mp-1) + (m-1)];    // eq 108
        mx[lw*mp + m]  = H * c_1;     // d(m',m)
        mx[-lw*mp + m] = H_ * c_1;    // d(-m',m)
    }

    return lw;    // return the number of lines (or columns) of the matrix.
}


int shtns_rotation_wigner_d_matrix(shtns_rot r, const int l, double* mx)
{
    // step 1:
    if (l==0) {
        mx[0] = 1;
        return 1;
    }

    const int lw = quarter_wigner_d_matrix(r, l, mx, 0);    // generate a quarter of the Wigner-d matrix (but full storage)
    if (lw <= 0) return 0;    // error

    // shift mx to index by negative values:
    mx += l*(2*l+1) + l;

    // step 6: fill the matrix using symmetries
    for (int m=1; m<=l; m++) {    // diagonals
        mx[(2*l+1)*m + -m]   = mx[(2*l+1)*(-m) + m];
        mx[(2*l+1)*(-m) - m] = mx[(2*l+1)*m + m];
    }
    for (int mp=-l+1; mp<l; mp++) {    // off-diagonals
        for (int m=abs(mp)+1; m<=l; m++) {
            double x = mx[(2*l+1)*mp + m];
            double parity = (1-2*((m-mp)&1));
            mx[(2*l+1)*(-m) - mp] = x;
            mx[(2*l+1)*m + mp]    = x * parity;
            mx[(2*l+1)*(-mp) - m] = x * parity;
        }
    }
    return lw;
}
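
/* Editor's usage sketch: retrieve the full Wigner-d matrix for degree l after
   setting beta. Based on the indexing used above, element d(m',m) sits at
   mx[(2*l+1)*(m'+l) + (m+l)] (an inference from the code, not from separate docs).

    shtns_rotation_set_angles_ZYZ(r, 0.0, beta, 0.0);
    double* mx = (double*) malloc(sizeof(double) * (2*l+1)*(2*l+1));
    int n = shtns_rotation_wigner_d_matrix(r, l, mx);    // returns 2*l+1, or 0 on error
*/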

/*
void shtns_rotation_apply_cplx(shtns_rot r, cplx* Zlm, cplx* Rlm)
{
    const int lmax = r->lmax;

    Rlm[0] = Zlm[0];    // copy l=0

    #pragma omp for schedule(dynamic)
    for (int l=1; l<=lmax; l++) {
        const int lw = l+1;    // line width of the compressed matrix.
        double* mx0 = (double*) malloc( sizeof(double) * (2*l+1)*lw );
        memset(mx0, 0, sizeof(double)*(2*l+1)*lw);

        quarter_wigner_d_matrix(r, l, mx0, 1);    // 1 : compressed matrix (saves memory and time)
        const double* mx = mx0 + l*lw;    // allow indexing by negative m and m'

        const cplx* zl = Zlm + l*(l+1);
        cplx* rl = Rlm + l*(l+1);
        // apply the matrix
        for (int m=-l; m<=l; m++) {
            cplx rm = 0.0;
            // only 1/4 of the matrix is read (4 times).
            for (int mp=l; mp>=abs(m); mp--) {    // third domain (right) : populated.
                rm += zl[mp] * mx[m*lw + mp];
            }
            if (m<0) {
                for (int mp=-m-1; mp>=m; mp--) {    // second domain (top)
                    rm += zl[mp] * mx[-mp*lw - m];    // exchange m and m' and change signs => no parity factor
                }
            } else {
                for (int mp=m-1; mp>=-m; mp-=2) {    // second domain (bottom) => always an even number of elements
                    rm -= zl[mp] * mx[mp*lw + m];        // exchange m and m' (negative parity)
                    rm += zl[mp-1] * mx[(mp-1)*lw + m];  // exchange m and m' (positive parity)
                }
            }
            for (int mp=-abs(m)-1; mp >-l; mp-=2) {    // first domain (left)
                rm -= zl[mp] * mx[-m*lw - mp];         // change sign of m and m'
                rm += zl[mp-1] * mx[-m*lw - (mp-1)];   // change sign of m and m'
            }
            if ((l+m)&1) {    int mp = -l;    // boundary condition.
                rm -= zl[mp] * mx[-m*lw - mp];    // change sign of m and m'
            }
            rl[m] = rm;
        }

        free(mx0);
    }
}
1694
1696{
    const int lmax = r->lmax+1;
    const int mmax = r->mmax;

    if (r->beta == 0.0) {    // only a rotation around the Z-axis.
        SH_Zrotate(r->sht, Qlm, r->alpha, Rlm);
    /*    if (Rlm != Qlm) for (int l=0; l<lmax; l++) Rlm[l] = Qlm[l];    // copy m=0
        long lm = lmax;
        const cplx ei_alpha = r->eia;
        cplx eim_alpha = ei_alpha;
        for (int m=1; m<=mmax; m++) {
            for (int l=m; l<lmax; l++) {
                Rlm[lm] = Qlm[lm] * eim_alpha;
                lm++;
            }
            eim_alpha *= ei_alpha;
        }    */
        return;
    }

    Rlm[0] = Qlm[0];    // copy l=0 (invariant by rotation)

    #pragma omp parallel firstprivate(lmax,mmax)
    {
        const double cos_beta = r->cos_beta;
        const double sin_beta = r->sin_beta;
        const double* const plm_beta = r->plm_beta;
        const int flag_ag = r->flag_alpha_gamma;

        const double cb_p1 = (1. + cos_beta)*0.5;
        const double cb_m1 = (1. - cos_beta)*0.5;

        const double m0_renorm = r->m0_renorm;
        const double m0_renorm_1 = 1.0 / m0_renorm;

        const cplx eia = r->eia;
        const cplx eig = r->eig;

//        cplx* const rl = (cplx*) malloc(sizeof(double) * (4*(l+1) + 5*lw +2));    // 2 real temp arrays 2*(l+1), + 5 lines of storage (lw) + a bit
        cplx* const rl = (cplx*) malloc(sizeof(double) * (9*lmax +2));    // 2 real temp arrays 2*(l+1), + 5 lines of storage (lw) + a bit

        #pragma omp for schedule(dynamic,1)
        for (int l=lmax-1; l>0; l--) {
            const int lw = l+1;    // line width of the matrix.
            const int mlim = (l <= mmax) ? l : mmax;
            cplx* const ql = rl + (l+1);
            double* const m0 = (double*) (rl + 2*(l+1));
            memset(rl, 0, sizeof(cplx)*(2*l+1));    // zero the temp destination array.

            // gather all m's for given l of the source array:
            long lm = l;
            ql[0] = creal(Qlm[lm]) * m0_renorm;    // m=0
            if (flag_ag & 1) {    // pre-rotate along the Z-axis
                cplx eima = eia;
                for (int m=1; m<=mlim; m++) {
                    lm += lmax-m;
                    ql[m] = Qlm[lm] * eima;
                    eima *= eia;
                }
            } else {    // copy
                for (int m=1; m<=mlim; m++) {
                    lm += lmax-m;
                    ((v2d*)ql)[m] = ((v2d*)Qlm)[lm];
                }
            }
            for (int m=mlim+1; m<=l; m++) ql[m] = 0;    // zero out m that are absent.

            m0[0] = plm_beta[l];    // d(m'=0, m=0)
            m0[lw] = 0.0;           // required, as a boundary condition
            double r0 = 0.5*creal(ql[0])*m0[0];    // accumulator for rl[0]

            // step 2+3:  d(m'=0,m)  and  d(m'=1,m)
            const double d_1 = 1.0/((l)*(l+1.));
            cplx r1 = 0.0;    // accumulator for rl[1]
            double parity = -1;
            long ofs_m = (lmax+2) - 1;    // m*(lmax+2) - (m*(m+1))/2  for m=1
            long ofs_mm1 = 0;             // m*(lmax+2) - (m*(m+1))/2  for m=0
            for (int m=1; m<=l; m++) {
                double H = plm_beta[ofs_m + (l-m)];    // m'=0 : copy plm values (eq 32)
                m0[m] = H;
                r0 += creal(ql[m]) * H;    // right quadrant (m>0)
                const double a = sqrt( ((l+1+m)*(l+1-m)) * d_1 ) * sin_beta;
                rl[m] += creal(ql[0]) * H*parity;    // bottom quadrant (m>0) : exchange m and m' => parity factor
                const double b1 = sqrt( ((l-m+1)*(l-m+2)) * d_1 ) * cb_p1;
                long ofs_mp1 = (m+1)*(lmax+2) - ((m+1)*(m+2))/2;
                const double b2 = sqrt( ((l+m+1)*(l+m+2)) * d_1 ) * cb_m1;
                parity = -parity;
                H = b2*plm_beta[ofs_mp1 + (l+1)-(m+1)] + b1*plm_beta[ofs_mm1 + (l+1)-(m-1)] + a*plm_beta[ofs_m + (l+1)-m];    // m'=1 (eq 105)
                m0[lw+m] = H;       // m'=1
                r1 += ql[m] * H;    // right quadrant (m>0)
                ofs_mm1 = ofs_m;
                ofs_m = ofs_mp1;
                if (m>1) {    // avoid duplicates
                    rl[m] += ql[1] * H*parity;    // bottom quadrant (m>0) : exchange m and m' => parity factor
                }
            }
            rl[0] = r0+r0;
            rl[1] += r1;

            double* clm = m0 + 4*lw +1;    // array holding clm (size l+2, addressing by m=-1 to l)
            clm[-1] = 0.0;
            for (int m=0; m<l; m++)    clm[m] = sqrt( (l-m)*(l+m+1) );    // precompute clm
            clm[l] = 0.0;    // boundary condition handled with this

            // step 5: recursively compute d(m',m), for m'=-1; m=1..l
            double* mx1_ = m0 + 2*lw;
            double* mx0_ = m0 + 3*lw;
            memcpy(mx1_, m0, sizeof(double)*2*lw);    // first copy the initial lines (m'=0 and m'=1).
        #if _GCC_VEC_
            s2d conj_parity = {-1.0, 1.0};    // change sign of real or imaginary part only.
        #endif
            {    const int mp = -1;
                double clm_1 = clm[0];
                const double c_1 = 1.0/clm_1;    // 1.0/clm[l+mp];
                double mx1_1 = mx1_[-mp-1];
                double mx1_0 = mx1_[-mp];
                v2d rmp = vdup(0.0);
            #if _GCC_VEC_
                v2d zlmp = ((v2d*)ql)[-mp] * conj_parity;
            #else
                cplx zlmp = conj(ql[-mp]) * (1-2*(mp&1));
            #endif
                int m = -mp;
                for (; m<l; m+=2) {
                    double clm0 = clm[m];
                    double clm1 = clm[m+1];
                    double mx11 = mx1_[m+1];
                    double mx12 = mx1_[m+2];
                    double H0 = mx0_[m]   + c_1*( -clm_1 * mx1_1 + clm0 * mx11 );    // eq 108
                    double H1 = mx0_[m+1] + c_1*( -clm0 * mx1_0  + clm1 * mx12 );
                    clm_1 = clm1;    mx1_1 = mx11;    mx1_0 = mx12;    // cycle coefficients and matrix elements.
                    mx0_[m] = H0;      // d(m',m)  -> overwrite d(m'+2,m)
                    mx0_[m+1] = H1;    // d(m',m)  -> overwrite d(m'+2,m)
                    rmp += ((v2d*)ql)[m] * vdup(H0) + ((v2d*)ql)[m+1] * vdup(H1);    // left quadrant, change signs => parity factor
                    if (m>-mp) {    // avoid duplicates
                        ((v2d*)rl)[m] += zlmp * vdup(H0);    // bottom quadrant (m>0) : exchange m and m' => parity factor
                    }
                    ((v2d*)rl)[m+1] -= zlmp * vdup(H1);      // bottom quadrant (m>0) : exchange m and m' => parity factor
                }
                if (m==l) {
                    double H0 = mx0_[m] - c_1 * clm_1 * mx1_1;    // eq 108
                    mx0_[m] = H0;    // d(m',m)  -> overwrite d(m'+2,m)
                    rmp += ((v2d*)ql)[m] * vdup(H0);    // left quadrant, change signs => parity factor
                    if (m>-mp) {    // avoid duplicates
                        ((v2d*)rl)[m] += zlmp * vdup(H0);    // bottom quadrant (m>0) : exchange m and m' => parity factor
                    }
                }
            #if _GCC_VEC_
                ((v2d*)rl)[-mp] += rmp * conj_parity;
                conj_parity = - conj_parity;    // switch parity
            #else
                rl[-mp] += conj(rmp)*(1-2*(mp&1));    // -mp > 0
            #endif
                double* t = mx0_;    mx0_ = mx1_;    mx1_ = t;    // cycle the buffers for lines.
            }

            // step 4 + 5 merged: recursively compute and apply d(m',m), m'=2..l; m=m'..l  AND  m'=-2..-l; m=-m'..l
            double* mx0 = m0;
            double* mx1 = m0 + lw;
            for (int mp=2; mp<=l; mp++) {
                const double cmp2 = clm[mp-2];
                double clm_1 = clm[mp-1];
                const double c_1 = 1.0/clm_1;

                v2d rmp = vdup(0.0);
                v2d zlmp = ((v2d*)ql)[mp];
                v2d rmp_ = vdup(0.0);
            #if _GCC_VEC_
                v2d zlmp_ = zlmp * conj_parity;
            #else
                cplx zlmp_ = conj(zlmp) * (1-2*(mp&1));
            #endif

                int m = mp;
                for (; m<=l-(VSIZE2-1); m+=VSIZE2) {
                    rnd clm_1 = *((rndu*)(clm+m-1));
                    rnd clm0  = *((rndu*)(clm+m));

                    rnd mx1_1 = *((rndu*)(mx1+m-1));
                    rnd mx11  = *((rndu*)(mx1+m+1));

                    rnd mx1__1 = *((rndu*)(mx1_+m-1));
                    rnd mx11_  = *((rndu*)(mx1_+m+1));

                    rnd mx00  = *((rndu*)(mx0+m));
                    rnd mx00_ = *((rndu*)(mx0_+m));

                    rnd H  = (vall(cmp2) * mx00  + clm_1 * mx1_1  - clm0 * mx11) * vall(c_1);     // eq 106
                    rnd H_ = (vall(cmp2) * mx00_ - clm_1 * mx1__1 + clm0 * mx11_) * vall(c_1);    // eq 108
                    *((rndu*)(mx0+m))  = H;
                    *((rndu*)(mx0_+m)) = H_;
                }
            /*    v2d mx1_1  = *((v2d*)(mx1+mp-1));
                v2d mx1__1 = *((v2d*)(mx1_+mp-1));
                for (; m<l; m+=2) {
                    v2d clm_1 = *((v2du*)(clm+m-1));
                    v2d clm0  = *((v2du*)(clm+m));

                    v2d mx11  = *((v2du*)(mx1+m+1));

                    v2d mx11_ = *((v2du*)(mx1_+m+1));

                    v2d mx00  = *((v2du*)(mx0+m));
                    v2d mx00_ = *((v2du*)(mx0_+m));

                    v2d H  = vdup(cmp2) * mx00  + clm_1 * mx1_1  - clm0 * mx11;     // eq 106
                    v2d H_ = vdup(cmp2) * mx00_ - clm_1 * mx1__1 + clm0 * mx11_;    // eq 108
                    mx1__1 = mx11_;    mx1_1 = mx11;
                    H  *= vdup(c_1);
                    H_ *= vdup(c_1);
                    *((v2du*)(mx0+m))  = H;
                    *((v2du*)(mx0_+m)) = H_;
                }    */
                for (; m<=l; m++) {
                    double clm_1 = clm[m-1];
                    double clm0  = clm[m];
                    double mx1_1  = mx1[m-1];
                    double mx1__1 = mx1_[m-1];
                    double mx11  = mx1[m+1];
                    double mx11_ = mx1_[m+1];
                    double H0  = cmp2 * mx0[m]  + clm_1 * mx1_1  - clm0 * mx11;     // eq 106
                    double H0_ = cmp2 * mx0_[m] - clm_1 * mx1__1 + clm0 * mx11_;    // eq 108
                    H0  *= c_1;    //mx[lw*mp + m] = H * c_1;    // d(m',m)
                    H0_ *= c_1;    //mx[lw*mp + m] = H * c_1;    // d(m',m)
                    mx0[m]  = H0;     // d(m',m)  -> overwrite d(m'-2,m)
                    mx0_[m] = H0_;    // d(m',m)  -> overwrite d(m'+2,m)
                }

                m = mp;
                for (; m<l; m+=2) {
                    double H0  = mx0[m];     double H1  = mx0[m+1];
                    double H0_ = mx0_[m];    double H1_ = mx0_[m+1];
                    rmp  += ((v2d*)ql)[m] * vdup(H0)  + ((v2d*)ql)[m+1] * vdup(H1);     //mx[mp*lw + m];    // right quadrant (m>0, mp>0)
                    rmp_ += ((v2d*)ql)[m] * vdup(H0_) + ((v2d*)ql)[m+1] * vdup(H1_);    // left quadrant, change signs => parity factor
                    if (m>mp) {    // avoid duplicates
                        ((v2d*)rl)[m] += zlmp * vdup(H0) + zlmp_ * vdup(H0_);    // bottom quadrant (m>0) : exchange m and m' => parity factor
                    }
                    ((v2d*)rl)[m+1] -= zlmp * vdup(H1) + zlmp_ * vdup(H1_);      // bottom quadrant (m>0) : exchange m and m' => parity factor
                }
                if (m==l) {
                    double H0  = mx0[m];     // d(m',m)  -> overwrite d(m'-2,m)
                    double H0_ = mx0_[m];    // d(m',m)  -> overwrite d(m'+2,m)
                    rmp  += ((v2d*)ql)[m] * vdup(H0);     //mx[mp*lw + m];    // right quadrant (m>0, mp>0)
                    rmp_ += ((v2d*)ql)[m] * vdup(H0_);    // left quadrant, change signs => parity factor
                    if (m>mp) {    // avoid duplicates
                        ((v2d*)rl)[m] += zlmp * vdup(H0) + zlmp_ * vdup(H0_);    // bottom quadrant (m>0) : exchange m and m' => parity factor
                    }
                }
            #if _GCC_VEC_
                ((v2d*)rl)[mp] += rmp + rmp_ * conj_parity;
                conj_parity = - conj_parity;    // switch parity
            #else
                rl[mp] += rmp + conj(rmp_)*(1-2*(mp&1));    // -mp > 0
1949 #endif
1950 double* t = mx0; mx0 = mx1; mx1 = t; // cycle the buffers for lines.
1951 double* t_ = mx0_; mx0_ = mx1_; mx1_ = t_; // cycle the buffers for lines.
1952 }
1953
1954 // scatter all m's for current l into dest array:
1955 lm = l;
1956 Rlm[lm] = creal(rl[0]) * m0_renorm_1; // m=0
1957 if (flag_ag & 2) { // post-rotate along Z-axis
1958 cplx eimg = eig;
1959 for (int m=1; m<=mlim; m++) {
1960 lm += lmax-m;
1961 Rlm[lm] = rl[m] * eimg;
1962 eimg *= eig;
1963 }
1964 } else { // copy values to dest
1965 for (int m=1; m<=mlim; m++) {
1966 lm += lmax-m;
1967 ((v2d*)Rlm)[lm] = ((v2d*)rl)[m];
1968 }
1969 }
1970 }
1971
1972 free(rl);
1973 }
1974}
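/* Usage sketch (not part of the original source): driving the rotation of a real
 * field's expansion through the public API. LMAX, alpha, beta, gamma, Qlm and Rlm
 * are placeholders to be supplied by the caller, and the orthonormal convention
 * is assumed, as required by shtns_rotation_apply_real.
 *
 *	shtns_rot r = shtns_rotation_create(LMAX, LMAX, sht_orthonormal);
 *	shtns_rotation_set_angles_ZYZ(r, alpha, beta, gamma);	// intrinsic Euler angles (radians)
 *	shtns_rotation_apply_real(r, Qlm, Rlm);			// Rlm receives the rotated coefficients
 *	shtns_rotation_destroy(r);
 */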
1975
1976/// rotate Zlm, store the result in Rlm, without ever storing the wigner-d matrices (on-the-fly operation).
1977void shtns_rotation_apply_cplx(shtns_rot r, cplx* Zlm, cplx* Rlm)
1978{
1979 const int lmax = r->lmax+1;
1980 const int mmax = r->mmax;
1981
1982// indexing scheme:
1983// if (l <= mmax) : l*(l+1) + m
1984// if (l >= mmax) : l*(2*mmax+1) - mmax*mmax + m = mmax*(2*l-mmax) + l+m
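// e.g. with mmax=2, the coefficient (l=3, m=-1) sits at mmax*(2*l-mmax) + l+m = 2*(6-2) + 2 = 10;
// both formulas agree at l == mmax, since l*(l+1) + m == mmax*(2*l-mmax) + l+m there.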
1985
1986 if (r->beta == 0.0) { // only rotation along Z-axis.
1987 long lm = 0;
1988 const cplx eia = r->eia;
1989 for (int l=0; l<lmax; l++) {
1990 long ll = (l<=mmax) ? l*(l+1) : mmax*(2*l-mmax) + l;
1991 cplx eim_alpha = eia;
1992 Rlm[ll + 0] = Zlm[ll + 0]; // copy m=0;
1993 for (int m=1; m<=l; m++) {
1994 Rlm[ll - m] = Zlm[ll - m] * conj(eim_alpha);
1995 Rlm[ll + m] = Zlm[ll + m] * eim_alpha;
1996 eim_alpha *= eia;
1997 }
1998 }
1999 return;
2000 }
2001
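	// beta != 0 : general case. Each degree l is processed independently below:
	// the lines d^l(m',m) of the Wigner-d matrix are generated by recurrence and
	// applied on the fly, so the full rotation matrix is never stored.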
2002 Rlm[0] = Zlm[0]; // copy l=0 (invariant by rotation)
2003
2004 #pragma omp parallel firstprivate(lmax,mmax)
2005 {
2006 const double cos_beta = r->cos_beta;
2007 const double sin_beta = r->sin_beta;
2008 const double* const plm_beta = r->plm_beta;
2009
2010 const double cb_p1 = (1. + cos_beta)*0.5;
2011 const double cb_m1 = (1. - cos_beta)*0.5;
2012
2013 const int flag_ag = r->flag_alpha_gamma;
2014 const cplx eia = r->eia;
2015 const cplx eig = r->eig;
2016 const int ntmp = 1 + (flag_ag & 1);
2017
2018//	v2d* rl = (v2d*) malloc(sizeof(double) * (2*ntmp*(2*l+1) + 5*lw +2));	// per-degree sizing, kept for reference
2019	v2d* const buf = (v2d*) malloc(sizeof(double) * (2*ntmp*(2*lmax-1) + 5*lmax +2)); // sized for the largest degree l=lmax-1 : ntmp cplx temp arrays (2*l+1 each) + 5 lines of storage (lw=l+1) + a bit
2020
2021 #pragma omp for schedule(dynamic,1)
2022 for (int l=lmax-1; l>0; l--) {
2023 const int lw = l+1; // line width of matrix.
2024 const int mlim = (l <= mmax) ? l : mmax;
2025 const long ll = (l <= mmax) ? l*(l+1) : mmax*(2*l-mmax) + l;
2026 v2d* rl = buf;
2027 double* const m0 = (double*) (buf + ntmp*(2*l+1));
2028 memset(rl, 0, sizeof(cplx)*(2*l+1)); // zero the temp dest array.
2029 rl += l; // shift pointer to allow indexing by m = -l to l
2030 v2d* zl = (v2d*)Zlm + l*(l+1); // source array, index by mp = -l to l, assumed aligned for sse2
2031 if (flag_ag & 1) { // pre-rotate around Z-axis
2032 zl = rl + 2*l+1;
2033 ((cplx*)zl)[0] = Zlm[l*(l+1) + 0];
2034 cplx eim_alpha = eia;
2035 for (int m=1; m<=mlim; m++) {
2036 ((cplx*)zl)[-m] = Zlm[ll - m] * conj(eim_alpha);
2037 ((cplx*)zl)[m] = Zlm[ll + m] * eim_alpha;
2038 eim_alpha *= eia;
2039 }
2040 } else if (mmax < l) { // we must copy the source to pad it with zeros
2041 zl = rl + 2*l+1;
2042 for (int m=-mmax; m<=mmax; m++) ((cplx*)zl)[m] = Zlm[ll+m];
2043 }
2044 for (int m=mlim+1; m<=l; m++) { // pad source with zeros
2045 ((cplx*)zl)[-m] = 0;
2046 ((cplx*)zl)[m] = 0;
2047 }
2048
2049 m0[0] = plm_beta[l]; // d(m'=0, m=0)
2050 m0[lw] = 0.0; // required, as a boundary condition
2051 v2d r0 = zl[0]*vdup(m0[0]); // accumulator for rl[0]
2052
2053 // step 2+3: d(m'=0,m) and d(m'=1,m)
2054 const double d_1 = 1.0/((l)*(l+1.));
2055 v2d r1 = vdup(0.0); // accumulator for rl[1]
2056 v2d rm1 = vdup(0.0); // accumulator for rl[-1]
2057 double parity = -1;
2058 long ofs_m = (lmax+2) - 1; // m*(lmax+2) - (m*(m+1))/2 for m=1
2059 long ofs_mm1 = 0; // m*(lmax+2) - (m*(m+1))/2 for m=0
2060 for (int m=1; m<=l; m++) {
2061 double H = plm_beta[ofs_m + (l-m)]; // m'=0 : copy plm values (eq 32)
2062 m0[m] = H;
2063 r0 += zl[m] * vdup(H); // right quadrant (m>0)
2064 r0 += zl[-m] * vdup(H*parity); // left quadrant (m<0) : change signs => parity factor
2065 const double a = sqrt( ((l+1+m)*(l+1-m)) * d_1 ) * sin_beta;
2066 rl[-m] += zl[0] * vdup(H); // top quadrant (m<0) : exchange m and m' and change signs => no parity factor
2067 rl[m] += zl[0] * vdup(H*parity); // bottom quadrant (m>0) : exchange m and m' => parity factor
2068 const double b1 = sqrt( ((l-m+1)*(l-m+2)) * d_1 ) * cb_p1;
2069 long ofs_mp1 = (m+1)*(lmax+2) - ((m+1)*(m+2))/2;
2070 const double b2 = sqrt( ((l+m+1)*(l+m+2)) * d_1 ) * cb_m1;
2071 parity = -parity;
2072 H = b2*plm_beta[ofs_mp1 + (l+1)-(m+1)] + b1*plm_beta[ofs_mm1 + (l+1)-(m-1)] + a*plm_beta[ofs_m + (l+1)-m]; // m'=1 (eq 105)
2073 m0[lw+m] = H;
2074 r1 += zl[m] * vdup(H); // right quadrant (m>0)
2075 ofs_mm1 = ofs_m;
2076 rm1 += zl[-m] * vdup(H*parity); // left quadrant: change signs => parity factor
2077 ofs_m = ofs_mp1;
2078 if (m>1) { // avoid duplicates
2079 rl[-m] += zl[-1] * vdup(H); // top quadrant (m<0) : exchange m and m' and change signs => no parity factor
2080 rl[m] += zl[1] * vdup(H*parity); // bottom quadrant (m>0) : exchange m and m' => parity factor
2081 }
2082 }
2083 rl[-1] += rm1;
2084 rl[0] = r0;
2085 rl[1] += r1;
2086
2087 double* clm = m0 + 4*lw +1; // array holding clm (size l+2, addressing by m=-1 to l)
2088 clm[-1] = 0.0; // boundary condition at m=-1
2089 for (int m=0; m<l; m++) clm[m] = sqrt( (l-m)*(l+m+1) ); // precompute clm
2090 clm[l] = 0.0; // handles the boundary condition at m=l
2091
2092 // step 5: recursively compute d(m',m), for m'=-1; m=1..l
2093 double* mx1_ = m0 + 2*lw;
2094 double* mx0_ = m0 + 3*lw;
2095 memcpy(mx1_, m0, sizeof(double)*2*lw); // first copy the initial lines (m'=0 and m'=1).
2096 { const int mp = -1; // m'=-1
2097 double clm_1 = clm[-mp-1];
2098 const double c_1 = 1.0/clm_1; // 1.0/clm[l+mp];
2099 double mx1_1 = mx1_[-mp-1];
2100 double mx1_0 = mx1_[-mp];
2101 v2d rmp = vdup(0.0);
2102 v2d rmmp = vdup(0.0);
2103 v2d zlmmp = zl[-mp];
2104 v2d zlmp = zl[mp];
2105 int m = -mp;
2106 for (; m<l; m+=2) {
2107 double clm0 = clm[m];
2108 double clm1 = clm[m+1];
2109 double mx11 = mx1_[m+1];
2110 double mx12 = mx1_[m+2];
2111 double H0 = mx0_[m] + c_1*( - clm_1 * mx1_1 + clm0 * mx11); // eq 108
2112 double H1 = mx0_[m+1] + c_1*( - clm0 * mx1_0 + clm1 * mx12);
2113 clm_1 = clm1; mx1_1 = mx11; mx1_0 = mx12; // cycle coefficients and matrix elements.
2114 mx0_[m] = H0; // d(m',m) -> overwrite d(m'+2,m)
2115 mx0_[m+1] = H1; // d(m',m) -> overwrite d(m'+2,m)
2116 rmp += zl[m] * vdup(H0) + zl[m+1] * vdup(H1); //mx[mp*lw + m]; // right quadrant (m>0, mp<0)
2117 rmmp += zl[-m] * vdup(H0) - zl[-m-1] * vdup(H1); // left quadrant, change signs => parity factor
2118 rl[-m-1] += zlmmp * vdup(H1); // top quadrant (-m<0) : exchange m and m' and change signs => no parity factor
2119 if (m>-mp) { // avoid duplicates
2120 rl[-m] += zlmmp * vdup(H0); // top quadrant (-m<0) : exchange m and m' and change signs => no parity factor
2121 rl[m] += zlmp * vdup(H0); // bottom quadrant (m>0) : exchange m and m' => parity factor
2122 }
2123 rl[m+1] -= zlmp * vdup(H1); // bottom quadrant (m>0) : exchange m and m' => parity factor
2124 }
2125 if(m==l) {
2126 double H0 = mx0_[m] - c_1 * clm_1 * mx1_1; // eq 108
2127 mx0_[m] = H0; // d(m',m) -> overwrite d(m'+2,m)
2128 rmp += zl[m] * vdup(H0); //mx[mp*lw + m]; // right quadrant (m>0, mp<0)
2129 rmmp += zl[-m] * vdup(H0); // left quadrant, change signs => parity factor
2130 if (m>-mp) { // avoid duplicates
2131 rl[-m] += zlmmp * vdup(H0); // top quadrant (-m<0) : exchange m and m' and change signs => no parity factor
2132 rl[m] += zlmp * vdup(H0); // bottom quadrant (m>0) : exchange m and m' => parity factor
2133 }
2134 }
2135 rl[-mp] += rmmp;
2136 rl[mp] += rmp;
2137 double* t = mx0_; mx0_ = mx1_; mx1_ = t; // cycle the buffers for lines.
2138 }
2139
2140 // step 4 + 5 merged: recursively compute and apply d(m',m), m'=2..l; m=m'..l AND m'=-2..-l; m=-m'..l
2141 double* mx0 = m0;
2142 double* mx1 = m0 + lw;
2143 for (int mp=2; mp<=l; mp++) {
2144 const double cmp2 = clm[mp-2];
2145 double clm_1 = clm[mp-1];
2146 const double c_1 = 1.0/clm_1;
2147 //v2d mx1_1 = *((v2d*)(mx1+mp-1));
2148 //v2d mx1__1 = *((v2d*)(mx1_+mp-1));
2149 v2d rmp = vdup(0.0);
2150 v2d rmmp = vdup(0.0);
2151 v2d zlmmp = zl[-mp];
2152 v2d zlmp = zl[mp];
2153 int m = mp;
2154 for (; m<=l-(VSIZE2-1); m+=VSIZE2) {
2155 rnd clm_1 = *((rndu*)(clm+m-1));
2156 rnd clm0 = *((rndu*)(clm+m));
2157
2158 rnd mx1_1 = *((rndu*)(mx1+m-1));
2159 rnd mx11 = *((rndu*)(mx1+m+1));
2160
2161 rnd mx1__1 = *((rndu*)(mx1_+m-1));
2162 rnd mx11_ = *((rndu*)(mx1_+m+1));
2163
2164 rnd mx00 = *((rndu*)(mx0+m));
2165 rnd mx00_ = *((rndu*)(mx0_+m));
2166
2167 rnd H = (vall(cmp2) * mx00 + clm_1 * mx1_1 - clm0 * mx11) * vall(c_1); // eq 106
2168 rnd H_ = (vall(cmp2) * mx00_ - clm_1 * mx1__1 + clm0 * mx11_) * vall(c_1); // eq 108
2169 //mx1__1 = mx11_; mx1_1 = mx11;
2170 *((rndu*)(mx0+m)) = H; // d(m',m) -> overwrite d(m'-2,m)
2171 *((rndu*)(mx0_+m)) = H_; // d(-m',m) -> overwrite d(-m'+2,m)
2172 }
2173 /* v2d mx1_1 = *((v2du*)(mx1+m-1));
2174 v2d mx1__1 = *((v2du*)(mx1_+m-1));
2175 for (; m<l; m+=2) {
2176 v2d clm_1 = *((v2du*)(clm+m-1));
2177 v2d clm0 = *((v2du*)(clm+m));
2178
2179 v2d mx11 = *((v2du*)(mx1+m+1));
2180
2181 v2d mx11_ = *((v2du*)(mx1_+m+1));
2182
2183 v2d mx00 = *((v2du*)(mx0+m));
2184 v2d mx00_ = *((v2du*)(mx0_+m));
2185
2186 v2d H = vdup(cmp2) * mx00 + clm_1 * mx1_1 - clm0 * mx11; // eq 106
2187 v2d H_ = vdup(cmp2) * mx00_ - clm_1 * mx1__1 + clm0 * mx11_; // eq 108
2188 mx1__1 = mx11_; mx1_1 = mx11;
2189 H *= vdup(c_1);
2190 H_ *= vdup(c_1);
2191 *((v2du*)(mx0+m)) = H;
2192 *((v2du*)(mx0_+m)) = H_;
2193 } */
2194 for (; m<=l; m++) {
2195 double clm_1 = clm[m-1];
2196 double clm0 = clm[m];
2197 double mx1_1 = mx1[m-1];
2198 double mx1__1 = mx1_[m-1];
2199 double mx11 = mx1[m+1];
2200 double mx11_ = mx1_[m+1];
2201 double H0 = (cmp2 * mx0[m] + clm_1 * mx1_1 - clm0 * mx11) * c_1; // eq 106
2202 double H0_ = (cmp2 * mx0_[m] - clm_1 * mx1__1 + clm0 * mx11_) * c_1; // eq 108
2203 mx0[m] = H0; // d(m',m) -> overwrite d(m'-2,m)
2204 mx0_[m] = H0_; // d(-m',m) -> overwrite d(-m'+2,m)
2205 }
2206
2207 m = mp;
2208 for (; m<l; m+=2) {
2209 double H0 = mx0[m]; double H1 = mx0[m+1];
2210 double H0_ = mx0_[m]; double H1_ = mx0_[m+1];
2211 rmp += zl[m] * vdup(H0) + zl[m+1] * vdup(H1); //mx[mp*lw + m]; // right quadrant (m>0, mp>0)
2212 rmmp += zl[m] * vdup(H0_) + zl[m+1] * vdup(H1_); //mx[mp*lw + m]; // right quadrant (m>0, mp<0)
2213 rmmp += zl[-m] * vdup(H0) - zl[-m-1] * vdup(H1); // left quadrant, change signs => parity factor
2214 rmp += zl[-m] * vdup(H0_) - zl[-m-1] * vdup(H1_); // left quadrant, change signs => parity factor
2215 rl[-m-1] += zlmmp * vdup(H1) + zlmp * vdup(H1_); // top quadrant (m<0) : exchange m and m' and change signs => no parity factor
2216 if (m>mp) { // avoid duplicates
2217 rl[-m] += zlmmp * vdup(H0) + zlmp * vdup(H0_); // top quadrant (m<0) : exchange m and m' and change signs => no parity factor
2218 rl[m] += zlmp * vdup(H0) + zlmmp * vdup(H0_); // bottom quadrant (m>0) : exchange m and m' => parity factor
2219 }
2220 rl[m+1] -= zlmp * vdup(H1) + zlmmp * vdup(H1_); // bottom quadrant (m>0) : exchange m and m' => parity factor
2221 }
2222 if (m==l) {
2223 double H0 = mx0[m];
2224 double H0_ = mx0_[m];
2225 rmp += zl[m] * vdup(H0); //mx[mp*lw + m]; // right quadrant (m>0, mp>0)
2226 rmmp += zl[m] * vdup(H0_); //mx[mp*lw + m]; // right quadrant (m>0, mp<0)
2227 rmmp += zl[-m] * vdup(H0); // left quadrant, change signs => parity factor
2228 rmp += zl[-m] * vdup(H0_); // left quadrant, change signs => parity factor
2229 if (m>mp) { // avoid duplicates
2230 rl[-m] += zlmmp * vdup(H0) + zlmp * vdup(H0_); // top quadrant (m<0) : exchange m and m' and change signs => no parity factor
2231 rl[m] += zlmp * vdup(H0) + zlmmp * vdup(H0_); // bottom quadrant (m>0) : exchange m and m' => parity factor
2232 }
2233 }
2234 rl[-mp] += rmmp;
2235 rl[mp] += rmp;
2236 double* t = mx0; mx0 = mx1; mx1 = t; // cycle the buffers for lines.
2237 double* t_ = mx0_; mx0_ = mx1_; mx1_ = t_; // cycle the buffers for lines.
2238 }
2239
2240 if (flag_ag & 2) { // apply post-rotation along Z-axis.
2241 Rlm[ll + 0] = ((cplx*)rl)[0];
2242 cplx eim_gamma = eig;
2243 for (int m=1; m<=mlim; m++) {
2244 Rlm[ll - m] = ((cplx*)rl)[-m] * conj(eim_gamma);
2245 Rlm[ll + m] = ((cplx*)rl)[m] * eim_gamma;
2246 eim_gamma *= eig;
2247 }
2248 } else {
2249 memcpy(Rlm + ll-mlim, rl-mlim, sizeof(cplx)*(2*mlim+1)); // copy to destination (avoid false sharing when doing openmp).
2250 }
2251 }
2252
2253 free(buf);
2254 }
2255}
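/* Usage sketch (not part of the original source): rotating a complex-field
 * expansion with the function above; Zlm and Rlm use the packed layout described
 * at the top of the function. LMAX, theta, Vx, Vy, Vz are placeholders, and the
 * orthonormal convention is assumed.
 *
 *	shtns_rot r = shtns_rotation_create(LMAX, LMAX, sht_orthonormal);
 *	shtns_rotation_set_angle_axis(r, theta, Vx, Vy, Vz);	// angle theta around cartesian axis (Vx,Vy,Vz)
 *	shtns_rotation_apply_cplx(r, Zlm, Rlm);		// on the fly: the Wigner-d matrices are never stored
 *	shtns_rotation_destroy(r);
 */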