Browse code

General convolution filtering of the source picture; dynamic memory allocation for the buffers (needed for the filter stuff).

Originally committed as revision 4291 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

Michael Niedermayer authored on 2002/01/21 12:56:47
Showing 3 changed files
... ...
@@ -443,7 +443,7 @@ static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
443 443
 // minor note: the HAVE_xyz is messed up after that line so dont use it
444 444
 
445 445
 
446
-// old global scaler, dont use for new code
446
+// old global scaler, dont use for new code, unless it uses only the stuff from the command line
447 447
 // will use sws_flags from the command line
448 448
 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
449 449
 			     int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
... ...
@@ -454,11 +454,31 @@ void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
454 454
 	int flags=0;
455 455
 	static int firstTime=1;
456 456
 	int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
457
+	static SwsFilter srcFilter={NULL, NULL, NULL, NULL};
457 458
 
458 459
 	if(firstTime)
459 460
 	{
460 461
 		flags= SWS_PRINT_INFO;
461 462
 		firstTime=0;
463
+{/*
464
+		SwsVector *g= getGaussianVec(1.7, 2);
465
+		SwsVector *id= getIdentityVec();
466
+		scaleVec(g, 0.2);
467
+
468
+
469
+//		srcFilter.chrH= diffVec(id, g);
470
+//		srcFilter.chrH= shiftVec(id, 20);
471
+		srcFilter.chrH= g;
472
+//		freeVec(g);
473
+		freeVec(id);
474
+
475
+		normalizeVec(srcFilter.chrH, 1.0);
476
+		printVec(srcFilter.chrH);
477
+
478
+		srcFilter.lumV= srcFilter.lumH= srcFilter.chrV= srcFilter.chrH;
479
+		srcFilter.lumH = srcFilter.lumV = NULL;
480
+//		srcFilter.chrH = srcFilter.chrV = NULL;
481
+*/}
462 482
 	}
463 483
 
464 484
 	switch(dstbpp)
... ...
@@ -481,32 +501,40 @@ void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
481 481
 		default:flags|= SWS_BILINEAR; break;
482 482
 	}
483 483
 
484
-	if(!context) context=getSwsContext(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat, flags, NULL, NULL);
484
+	if(!context) context=getSwsContext(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat, flags, &srcFilter, NULL);
485 485
 
486 486
 
487 487
 	swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
488 488
 }
489 489
 
490
-static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
491
-			      int srcW, int dstW, int filterAlign, int one, int flags)
490
+static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
491
+			      int srcW, int dstW, int filterAlign, int one, int flags,
492
+			      SwsVector *srcFilter, SwsVector *dstFilter)
492 493
 {
493 494
 	int i;
494
-	double filter[10000];
495
+	int filterSize;
496
+	int filter2Size;
497
+	int minFilterSize;
498
+	double *filter=NULL;
499
+	double *filter2=NULL;
495 500
 #ifdef ARCH_X86
496 501
 	if(gCpuCaps.hasMMX)
497 502
 		asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
498 503
 #endif
499 504
 
505
+	*filterPos = (int16_t*)memalign(8, dstW*sizeof(int16_t));
506
+
500 507
 	if(ABS(xInc - 0x10000) <10) // unscaled
501 508
 	{
502 509
 		int i;
503
-		*filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normaly
504
-		for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
510
+		filterSize= 1;
511
+		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
512
+		for(i=0; i<dstW*filterSize; i++) filter[i]=0;
505 513
 
506 514
 		for(i=0; i<dstW; i++)
507 515
 		{
508
-			filter[i*(*filterSize)]=1;
509
-			filterPos[i]=i;
516
+			filter[i*filterSize]=1;
517
+			(*filterPos)[i]=i;
510 518
 		}
511 519
 
512 520
 	}
... ...
@@ -514,19 +542,19 @@ static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filte
514 514
 	{
515 515
 		int i;
516 516
 		int xDstInSrc;
517
-		if     (flags&SWS_BICUBIC) *filterSize= 4;
518
-		else if(flags&SWS_X      ) *filterSize= 4;
519
-		else			   *filterSize= 2;
517
+		if     (flags&SWS_BICUBIC) filterSize= 4;
518
+		else if(flags&SWS_X      ) filterSize= 4;
519
+		else			   filterSize= 2;
520 520
 //		printf("%d %d %d\n", filterSize, srcW, dstW);
521
-		*filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
521
+		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
522 522
 
523 523
 		xDstInSrc= xInc/2 - 0x8000;
524 524
 		for(i=0; i<dstW; i++)
525 525
 		{
526
-			int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
526
+			int xx= (xDstInSrc>>16) - (filterSize>>1) + 1;
527 527
 			int j;
528 528
 
529
-			filterPos[i]= xx;
529
+			(*filterPos)[i]= xx;
530 530
 			if((flags & SWS_BICUBIC) || (flags & SWS_X))
531 531
 			{
532 532
 				double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
... ...
@@ -547,21 +575,21 @@ static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filte
547 547
 				}
548 548
 
549 549
 //				printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
550
-				filter[i*(*filterSize) + 0]= y1;
551
-				filter[i*(*filterSize) + 1]= y2;
552
-				filter[i*(*filterSize) + 2]= y3;
553
-				filter[i*(*filterSize) + 3]= y4;
550
+				filter[i*filterSize + 0]= y1;
551
+				filter[i*filterSize + 1]= y2;
552
+				filter[i*filterSize + 2]= y3;
553
+				filter[i*filterSize + 3]= y4;
554 554
 //				printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
555 555
 			}
556 556
 			else
557 557
 			{
558
-				for(j=0; j<*filterSize; j++)
558
+				for(j=0; j<filterSize; j++)
559 559
 				{
560 560
 					double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
561 561
 					double coeff= 1.0 - d;
562 562
 					if(coeff<0) coeff=0;
563 563
 	//				printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
564
-					filter[i*(*filterSize) + j]= coeff;
564
+					filter[i*filterSize + j]= coeff;
565 565
 					xx++;
566 566
 				}
567 567
 			}
... ...
@@ -571,19 +599,19 @@ static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filte
571 571
 	else // downscale
572 572
 	{
573 573
 		int xDstInSrc;
574
-		if(flags&SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
575
-		else if(flags&SWS_X)  *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
576
-		else		      *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
574
+		if(flags&SWS_BICUBIC) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
575
+		else if(flags&SWS_X)  filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
576
+		else		      filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
577 577
 //		printf("%d %d %d\n", *filterSize, srcW, dstW);
578
-		*filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
578
+		filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
579 579
 
580 580
 		xDstInSrc= xInc/2 - 0x8000;
581 581
 		for(i=0; i<dstW; i++)
582 582
 		{
583
-			int xx= (int)((double)xDstInSrc/(double)(1<<16) - ((*filterSize)-1)*0.5 + 0.5);
583
+			int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
584 584
 			int j;
585
-			filterPos[i]= xx;
586
-			for(j=0; j<*filterSize; j++)
585
+			(*filterPos)[i]= xx;
586
+			for(j=0; j<filterSize; j++)
587 587
 			{
588 588
 				double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
589 589
 				double coeff;
... ...
@@ -608,62 +636,155 @@ static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filte
608 608
 					if(coeff<0) coeff=0;
609 609
 				}
610 610
 //				printf("%1.3f %d %d \n", coeff, (int)d, xDstInSrc);
611
-				filter[i*(*filterSize) + j]= coeff;
611
+				filter[i*filterSize + j]= coeff;
612 612
 				xx++;
613 613
 			}
614 614
 			xDstInSrc+= xInc;
615 615
 		}
616 616
 	}
617 617
 
618
+	/* apply src & dst Filter to filter -> filter2
619
+	   free(filter);
620
+	*/
621
+	filter2Size= filterSize;
622
+	if(srcFilter) filter2Size+= srcFilter->length - 1;
623
+	if(dstFilter) filter2Size+= dstFilter->length - 1;
624
+	filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
625
+
626
+	for(i=0; i<dstW; i++)
627
+	{
628
+		int j;
629
+		SwsVector scaleFilter;
630
+		SwsVector *outVec;
631
+
632
+		scaleFilter.coeff= filter + i*filterSize;
633
+		scaleFilter.length= filterSize;
634
+
635
+		if(srcFilter) outVec= convVec(srcFilter, &scaleFilter);
636
+		else	      outVec= &scaleFilter;
637
+
638
+		ASSERT(outVec->length == filter2Size)
639
+		//FIXME dstFilter
640
+
641
+		for(j=0; j<outVec->length; j++)
642
+		{
643
+			filter2[i*filter2Size + j]= outVec->coeff[j];
644
+		}
645
+
646
+		(*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
647
+
648
+		if(outVec != &scaleFilter) freeVec(outVec);
649
+	}
650
+	free(filter); filter=NULL;
651
+
652
+	/* try to reduce the filter-size (step1 find size and shift left) */
653
+	// Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
654
+	minFilterSize= 0;
655
+	for(i=dstW-1; i>=0; i--)
656
+	{
657
+		int min= filter2Size;
658
+		int j;
659
+		double cutOff=0.0;
660
+
661
+		/* get rid off near zero elements on the left by shifting left */
662
+		for(j=0; j<filter2Size; j++)
663
+		{
664
+			int k;
665
+			cutOff += ABS(filter2[i*filter2Size]);
666
+
667
+			if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
668
+
669
+			/* preserve Monotonicity because the core cant handle the filter otherwise */
670
+			if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
671
+
672
+			// Move filter coeffs left
673
+			for(k=1; k<filter2Size; k++)
674
+				filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
675
+			filter2[i*filter2Size + k - 1]= 0.0;
676
+			(*filterPos)[i]++;
677
+		}
678
+
679
+		cutOff=0.0;
680
+		/* count near zeros on the right */
681
+		for(j=filter2Size-1; j>0; j--)
682
+		{
683
+			cutOff += ABS(filter2[i*filter2Size + j]);
684
+
685
+			if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
686
+			min--;
687
+		}
688
+
689
+		if(min>minFilterSize) minFilterSize= min;
690
+	}
691
+
692
+	/* try to reduce the filter-size (step2 reduce it) */
693
+	for(i=0; i<dstW; i++)
694
+	{
695
+		int j;
696
+
697
+		for(j=0; j<minFilterSize; j++)
698
+			filter2[i*minFilterSize + j]= filter2[i*filter2Size + j];
699
+	}
700
+	if((flags&SWS_PRINT_INFO) && verbose)
701
+		printf("SwScaler: reducing filtersize %d -> %d\n", filter2Size, minFilterSize);
702
+	filter2Size= minFilterSize;
703
+	ASSERT(filter2Size > 0)
704
+
705
+	//FIXME try to align filterpos if possible
706
+
618 707
 	//fix borders
619 708
 	for(i=0; i<dstW; i++)
620 709
 	{
621 710
 		int j;
622
-		if(filterPos[i] < 0)
711
+		if((*filterPos)[i] < 0)
623 712
 		{
624 713
 			// Move filter coeffs left to compensate for filterPos
625
-			for(j=1; j<*filterSize; j++)
714
+			for(j=1; j<filter2Size; j++)
626 715
 			{
627
-				int left= MAX(j + filterPos[i], 0);
628
-				filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
629
-				filter[i*(*filterSize) + j]=0;
716
+				int left= MAX(j + (*filterPos)[i], 0);
717
+				filter2[i*filter2Size + left] += filter2[i*filter2Size + j];
718
+				filter2[i*filter2Size + j]=0;
630 719
 			}
631
-			filterPos[i]= 0;
720
+			(*filterPos)[i]= 0;
632 721
 		}
633 722
 
634
-		if(filterPos[i] + (*filterSize) > srcW)
723
+		if((*filterPos)[i] + filter2Size > srcW)
635 724
 		{
636
-			int shift= filterPos[i] + (*filterSize) - srcW;
725
+			int shift= (*filterPos)[i] + filter2Size - srcW;
637 726
 			// Move filter coeffs right to compensate for filterPos
638
-			for(j=(*filterSize)-2; j>=0; j--)
727
+			for(j=filter2Size-2; j>=0; j--)
639 728
 			{
640
-				int right= MIN(j + shift, (*filterSize)-1);
641
-				filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
642
-				filter[i*(*filterSize) +j]=0;
729
+				int right= MIN(j + shift, filter2Size-1);
730
+				filter2[i*filter2Size +right] += filter2[i*filter2Size +j];
731
+				filter2[i*filter2Size +j]=0;
643 732
 			}
644
-			filterPos[i]= srcW - (*filterSize);
733
+			(*filterPos)[i]= srcW - filter2Size;
645 734
 		}
646 735
 	}
647 736
 
648
-	//FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
649
-	// and skip these than later
650 737
 
651
-	//Normalize
738
+	*outFilterSize= (filter2Size +(filterAlign-1)) & (~(filterAlign-1));
739
+	*outFilter= (int16_t*)memalign(8, *outFilterSize*dstW*sizeof(int16_t));
740
+	memset(*outFilter, 0, *outFilterSize*dstW*sizeof(int16_t));
741
+
742
+	/* Normalize & Store in outFilter */
652 743
 	for(i=0; i<dstW; i++)
653 744
 	{
654 745
 		int j;
655 746
 		double sum=0;
656 747
 		double scale= one;
657
-		for(j=0; j<*filterSize; j++)
748
+		for(j=0; j<filter2Size; j++)
658 749
 		{
659
-			sum+= filter[i*(*filterSize) + j];
750
+			sum+= filter2[i*filter2Size + j];
660 751
 		}
661 752
 		scale/= sum;
662
-		for(j=0; j<*filterSize; j++)
753
+		for(j=0; j<filter2Size; j++)
663 754
 		{
664
-			dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
755
+			(*outFilter)[i*(*outFilterSize) + j]= (int)(filter2[i*filter2Size + j]*scale);
665 756
 		}
666 757
 	}
758
+
759
+	free(filter2);
667 760
 }
668 761
 
669 762
 #ifdef ARCH_X86
... ...
@@ -822,18 +943,12 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
822 822
 	const int widthAlign= dstFormat==IMGFMT_YV12 ? 16 : 8;
823 823
 	SwsContext *c;
824 824
 	int i;
825
-//const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
826
-//const int over= dstFormat==IMGFMT_YV12 ? 	  (((dstW+15)&(~15))) - dststride
827
-//						: (((dstW+7)&(~7)))*bytespp - dststride;
825
+	SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
826
+
828 827
 	if(swScale==NULL) globalInit();
829 828
 
830 829
 	/* sanity check */
831 830
 	if(srcW<1 || srcH<1 || dstW<1 || dstH<1) return NULL;
832
-	if(srcW>=SWS_MAX_SIZE || dstW>=SWS_MAX_SIZE || srcH>=SWS_MAX_SIZE || dstH>=SWS_MAX_SIZE)
833
-	{
834
-		fprintf(stderr, "size is too large, increase SWS_MAX_SIZE\n");
835
-		return NULL;
836
-	}
837 831
 
838 832
 /* FIXME
839 833
 	if(dstStride[0]%widthAlign !=0 )
... ...
@@ -844,7 +959,11 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
844 844
 					widthAlign);
845 845
 	}
846 846
 */
847
+	if(!dstFilter) dstFilter= &dummyFilter;
848
+	if(!srcFilter) srcFilter= &dummyFilter;
849
+
847 850
 	c= memalign(64, sizeof(SwsContext));
851
+	memset(c, 0, sizeof(SwsContext));
848 852
 
849 853
 	c->srcW= srcW;
850 854
 	c->srcH= srcH;
... ...
@@ -895,10 +1014,12 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
895 895
 	{
896 896
 		const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
897 897
 
898
-		initFilter(c->hLumFilter, c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
899
-				 srcW      ,       dstW, filterAlign, 1<<14, flags);
900
-		initFilter(c->hChrFilter, c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
901
-				(srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags);
898
+		initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
899
+				 srcW      ,       dstW, filterAlign, 1<<14, flags,
900
+				 srcFilter->lumH, dstFilter->lumH);
901
+		initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
902
+				(srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
903
+				 srcFilter->chrH, dstFilter->chrH);
902 904
 
903 905
 #ifdef ARCH_X86
904 906
 // cant downscale !!!
... ...
@@ -913,10 +1034,12 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
913 913
 
914 914
 
915 915
 	/* precalculate vertical scaler filter coefficients */
916
-	initFilter(c->vLumFilter, c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
917
-			srcH      ,        dstH, 1, (1<<12)-4, flags);
918
-	initFilter(c->vChrFilter, c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
919
-			(srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags);
916
+	initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
917
+			srcH      ,        dstH, 1, (1<<12)-4, flags,
918
+			srcFilter->lumV, dstFilter->lumV);
919
+	initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
920
+			(srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
921
+			 srcFilter->chrV, dstFilter->chrV);
920 922
 
921 923
 	// Calculate Buffer Sizes so that they wont run out while handling these damn slices
922 924
 	c->vLumBufSize= c->vLumFilterSize;
... ...
@@ -935,6 +1058,8 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
935 935
 
936 936
 	// allocate pixbufs (we use dynamic allocation because otherwise we would need to
937 937
 	// allocate several megabytes to handle all possible cases)
938
+	c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
939
+	c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
938 940
 	for(i=0; i<c->vLumBufSize; i++)
939 941
 		c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
940 942
 	for(i=0; i<c->vChrBufSize; i++)
... ...
@@ -945,12 +1070,12 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
945 945
 	for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
946 946
 
947 947
 	ASSERT(c->chrDstH <= dstH)
948
-	ASSERT(c->vLumFilterSize*      dstH*4 <= SWS_MAX_SIZE*20)
949
-	ASSERT(c->vChrFilterSize*c->chrDstH*4 <= SWS_MAX_SIZE*20)
950 948
 
951 949
 	// pack filter data for mmx code
952 950
 	if(cpuCaps.hasMMX)
953 951
 	{
952
+		c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
953
+		c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
954 954
 		for(i=0; i<c->vLumFilterSize*dstH; i++)
955 955
 			c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
956 956
 				c->vLumFilter[i];
... ...
@@ -1064,11 +1189,16 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
1064 1064
  * returns a normalized gaussian curve used to filter stuff
1065 1065
 * quality=3 is high quality, lower is lower quality
1066 1066
  */
1067
-double *getGaussian(double variance, double quality){
1067
+
1068
+SwsVector *getGaussianVec(double variance, double quality){
1068 1069
 	const int length= (int)(variance*quality + 0.5) | 1;
1069 1070
 	int i;
1070 1071
 	double *coeff= memalign(sizeof(double), length*sizeof(double));
1071 1072
 	double middle= (length-1)*0.5;
1073
+	SwsVector *vec= malloc(sizeof(SwsVector));
1074
+
1075
+	vec->coeff= coeff;
1076
+	vec->length= length;
1072 1077
 
1073 1078
 	for(i=0; i<length; i++)
1074 1079
 	{
... ...
@@ -1076,51 +1206,201 @@ double *getGaussian(double variance, double quality){
1076 1076
 		coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1077 1077
 	}
1078 1078
 
1079
-	normalize(coeff, length, 1.0);
1080
-	return coeff;
1079
+	normalizeVec(vec, 1.0);
1080
+
1081
+	return vec;
1081 1082
 }
1082 1083
 
1083
-void normalize(double *coeff, int length, double height){
1084
+SwsVector *getIdentityVec(void){
1085
+	double *coeff= memalign(sizeof(double), sizeof(double));
1086
+	SwsVector *vec= malloc(sizeof(SwsVector));
1087
+	coeff[0]= 1.0;
1088
+
1089
+	vec->coeff= coeff;
1090
+	vec->length= 1;
1091
+
1092
+	return vec;
1093
+}
1094
+
1095
+void normalizeVec(SwsVector *a, double height){
1084 1096
 	int i;
1085 1097
 	double sum=0;
1086 1098
 	double inv;
1087 1099
 
1088
-	for(i=0; i<length; i++)
1089
-		sum+= coeff[i];
1100
+	for(i=0; i<a->length; i++)
1101
+		sum+= a->coeff[i];
1090 1102
 
1091 1103
 	inv= height/sum;
1092 1104
 
1093
-	for(i=0; i<length; i++)
1094
-		coeff[i]*= height;
1105
+	for(i=0; i<a->length; i++)
1106
+		a->coeff[i]*= height;
1095 1107
 }
1096 1108
 
1097
-double *conv(double *a, int aLength, double *b, int bLength){
1098
-	int length= aLength + bLength - 1;
1109
+void scaleVec(SwsVector *a, double scalar){
1110
+	int i;
1111
+
1112
+	for(i=0; i<a->length; i++)
1113
+		a->coeff[i]*= scalar;
1114
+}
1115
+
1116
+SwsVector *convVec(SwsVector *a, SwsVector *b){
1117
+	int length= a->length + b->length - 1;
1099 1118
 	double *coeff= memalign(sizeof(double), length*sizeof(double));
1100 1119
 	int i, j;
1120
+	SwsVector *vec= malloc(sizeof(SwsVector));
1121
+
1122
+	vec->coeff= coeff;
1123
+	vec->length= length;
1101 1124
 
1102 1125
 	for(i=0; i<length; i++) coeff[i]= 0.0;
1103 1126
 
1104
-	for(i=0; i<aLength; i++)
1127
+	for(i=0; i<a->length; i++)
1105 1128
 	{
1106
-		for(j=0; j<bLength; j++)
1129
+		for(j=0; j<b->length; j++)
1107 1130
 		{
1108
-			coeff[i+j]+= a[i]*b[j];
1131
+			coeff[i+j]+= a->coeff[i]*b->coeff[j];
1109 1132
 		}
1110 1133
 	}
1111 1134
 
1112
-	return coeff;
1135
+	return vec;
1113 1136
 }
1114 1137
 
1115
-/*
1116
-double *sum(double *a, int aLength, double *b, int bLength){
1117
-	int length= MAX(aLength, bLength);
1138
+SwsVector *sumVec(SwsVector *a, SwsVector *b){
1139
+	int length= MAX(a->length, b->length);
1118 1140
 	double *coeff= memalign(sizeof(double), length*sizeof(double));
1119 1141
 	int i;
1142
+	SwsVector *vec= malloc(sizeof(SwsVector));
1143
+
1144
+	vec->coeff= coeff;
1145
+	vec->length= length;
1120 1146
 
1121 1147
 	for(i=0; i<length; i++) coeff[i]= 0.0;
1122 1148
 
1123
-	for(i=0; i<aLength; i++) coeff[i]+= a[i];
1149
+	for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1150
+	for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1151
+
1152
+	return vec;
1124 1153
 }
1125
-*/
1154
+
1155
+SwsVector *diffVec(SwsVector *a, SwsVector *b){
1156
+	int length= MAX(a->length, b->length);
1157
+	double *coeff= memalign(sizeof(double), length*sizeof(double));
1158
+	int i;
1159
+	SwsVector *vec= malloc(sizeof(SwsVector));
1160
+
1161
+	vec->coeff= coeff;
1162
+	vec->length= length;
1163
+
1164
+	for(i=0; i<length; i++) coeff[i]= 0.0;
1165
+
1166
+	for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1167
+	for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1168
+
1169
+	return vec;
1170
+}
1171
+
1172
+/* shift left / or right if "shift" is negative */
1173
+SwsVector *shiftVec(SwsVector *a, int shift){
1174
+	int length= a->length + ABS(shift)*2;
1175
+	double *coeff= memalign(sizeof(double), length*sizeof(double));
1176
+	int i, j;
1177
+	SwsVector *vec= malloc(sizeof(SwsVector));
1178
+
1179
+	vec->coeff= coeff;
1180
+	vec->length= length;
1181
+
1182
+	for(i=0; i<length; i++) coeff[i]= 0.0;
1183
+
1184
+	for(i=0; i<a->length; i++)
1185
+	{
1186
+		coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1187
+	}
1188
+
1189
+	return vec;
1190
+}
1191
+
1192
+void printVec(SwsVector *a){
1193
+	int i;
1194
+	double max=0;
1195
+	double min=0;
1196
+	double range;
1197
+
1198
+	for(i=0; i<a->length; i++)
1199
+		if(a->coeff[i]>max) max= a->coeff[i];
1200
+
1201
+	for(i=0; i<a->length; i++)
1202
+		if(a->coeff[i]<min) min= a->coeff[i];
1203
+
1204
+	range= max - min;
1205
+
1206
+	for(i=0; i<a->length; i++)
1207
+	{
1208
+		int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
1209
+		printf("%1.3f ", a->coeff[i]);
1210
+		for(;x>0; x--) printf(" ");
1211
+		printf("|\n");
1212
+	}
1213
+}
1214
+
1215
+void freeVec(SwsVector *a){
1216
+	if(!a) return;
1217
+	if(a->coeff) free(a->coeff);
1218
+	a->coeff=NULL;
1219
+	a->length=0;
1220
+	free(a);
1221
+}
1222
+
1223
+void freeSwsContext(SwsContext *c){
1224
+	int i;
1225
+
1226
+	if(!c) return;
1227
+
1228
+	if(c->lumPixBuf)
1229
+	{
1230
+		for(i=0; i<c->vLumBufSize*2; i++)
1231
+		{
1232
+			if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
1233
+			c->lumPixBuf[i]=NULL;
1234
+		}
1235
+		free(c->lumPixBuf);
1236
+		c->lumPixBuf=NULL;
1237
+	}
1238
+
1239
+	if(c->chrPixBuf)
1240
+	{
1241
+		for(i=0; i<c->vChrBufSize*2; i++)
1242
+		{
1243
+			if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
1244
+			c->chrPixBuf[i]=NULL;
1245
+		}
1246
+		free(c->chrPixBuf);
1247
+		c->chrPixBuf=NULL;
1248
+	}
1249
+
1250
+	if(c->vLumFilter) free(c->vLumFilter);
1251
+	c->vLumFilter = NULL;
1252
+	if(c->vChrFilter) free(c->vChrFilter);
1253
+	c->vChrFilter = NULL;
1254
+	if(c->hLumFilter) free(c->hLumFilter);
1255
+	c->hLumFilter = NULL;
1256
+	if(c->hChrFilter) free(c->hChrFilter);
1257
+	c->hChrFilter = NULL;
1258
+
1259
+	if(c->vLumFilterPos) free(c->vLumFilterPos);
1260
+	c->vLumFilterPos = NULL;
1261
+	if(c->vChrFilterPos) free(c->vChrFilterPos);
1262
+	c->vChrFilterPos = NULL;
1263
+	if(c->hLumFilterPos) free(c->hLumFilterPos);
1264
+	c->hLumFilterPos = NULL;
1265
+	if(c->hChrFilterPos) free(c->hChrFilterPos);
1266
+	c->hChrFilterPos = NULL;
1267
+
1268
+	if(c->lumMmxFilter) free(c->lumMmxFilter);
1269
+	c->lumMmxFilter = NULL;
1270
+	if(c->chrMmxFilter) free(c->chrMmxFilter);
1271
+	c->chrMmxFilter = NULL;
1272
+
1273
+	free(c);
1274
+}
1275
+
1126 1276
 
... ...
@@ -7,7 +7,7 @@
7 7
 #define SWS_FULL_UV_IPOL 0x100
8 8
 #define SWS_PRINT_INFO 0x1000
9 9
 
10
-#define SWS_MAX_SIZE 2000
10
+#define SWS_MAX_REDUCE_CUTOFF 0.002
11 11
 
12 12
 /* this struct should be aligned on at least 32-byte boundary */
13 13
 typedef struct{
... ...
@@ -16,20 +16,21 @@ typedef struct{
16 16
 	int lumXInc, chrXInc;
17 17
 	int lumYInc, chrYInc;
18 18
 	int dstFormat, srcFormat;
19
-	int16_t __attribute__((aligned(8))) *lumPixBuf[SWS_MAX_SIZE];
20
-	int16_t __attribute__((aligned(8))) *chrPixBuf[SWS_MAX_SIZE];
21
-	int16_t __attribute__((aligned(8))) hLumFilter[SWS_MAX_SIZE*5];
22
-	int16_t __attribute__((aligned(8))) hLumFilterPos[SWS_MAX_SIZE];
23
-	int16_t __attribute__((aligned(8))) hChrFilter[SWS_MAX_SIZE*5];
24
-	int16_t __attribute__((aligned(8))) hChrFilterPos[SWS_MAX_SIZE];
25
-	int16_t __attribute__((aligned(8))) vLumFilter[SWS_MAX_SIZE*5];
26
-	int16_t __attribute__((aligned(8))) vLumFilterPos[SWS_MAX_SIZE];
27
-	int16_t __attribute__((aligned(8))) vChrFilter[SWS_MAX_SIZE*5];
28
-	int16_t __attribute__((aligned(8))) vChrFilterPos[SWS_MAX_SIZE];
19
+
20
+	int16_t **lumPixBuf;
21
+	int16_t **chrPixBuf;
22
+	int16_t *hLumFilter;
23
+	int16_t *hLumFilterPos;
24
+	int16_t *hChrFilter;
25
+	int16_t *hChrFilterPos;
26
+	int16_t *vLumFilter;
27
+	int16_t *vLumFilterPos;
28
+	int16_t *vChrFilter;
29
+	int16_t *vChrFilterPos;
29 30
 
30 31
 // Contain simply the values from v(Lum|Chr)Filter just nicely packed for mmx
31
-	int16_t __attribute__((aligned(8))) lumMmxFilter[SWS_MAX_SIZE*20];
32
-	int16_t __attribute__((aligned(8))) chrMmxFilter[SWS_MAX_SIZE*20];
32
+	int16_t  *lumMmxFilter;
33
+	int16_t  *chrMmxFilter;
33 34
 
34 35
 	int hLumFilterSize;
35 36
 	int hChrFilterSize;
... ...
@@ -52,12 +53,19 @@ typedef struct{
52 52
 } SwsContext;
53 53
 //FIXME check init (where 0)
54 54
 
55
+// when used for filters they must have an odd number of elements
56
+// coeffs cannot be shared between vectors
55 57
 typedef struct {
56
-	double *lumH;
57
-	double *lumV;
58
-	double *chrH;
59
-	double *chrV;
58
+	double *coeff;
60 59
 	int length;
60
+} SwsVector;
61
+
62
+// vectors can be shared
63
+typedef struct {
64
+	SwsVector *lumH;
65
+	SwsVector *lumV;
66
+	SwsVector *chrH;
67
+	SwsVector *chrV;
61 68
 } SwsFilter;
62 69
 
63 70
 
... ...
@@ -74,7 +82,7 @@ void SwScale_Init();
74 74
 
75 75
 
76 76
 
77
-void freeSwsContext(SwsContext swsContext);
77
+void freeSwsContext(SwsContext *swsContext);
78 78
 
79 79
 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
80 80
 			 SwsFilter *srcFilter, SwsFilter *dstFilter);
... ...
@@ -82,9 +90,15 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
82 82
 extern void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
83 83
              int srcSliceH, uint8_t* dst[], int dstStride[]);
84 84
 
85
-double *getGaussian(double variance, double quality);
86
-
87
-void normalize(double *coeff, int length, double height);
88
-
89
-double *conv(double *a, int aLength, double *b, int bLength);
85
+SwsVector *getGaussianVec(double variance, double quality);
86
+SwsVector *getIdentityVec(void);
87
+void scaleVec(SwsVector *a, double scalar);
88
+void normalizeVec(SwsVector *a, double height);
89
+SwsVector *convVec(SwsVector *a, SwsVector *b);
90
+SwsVector *sumVec(SwsVector *a, SwsVector *b);
91
+SwsVector *diffVec(SwsVector *a, SwsVector *b);
92
+SwsVector *shiftVec(SwsVector *a, int shift);
93
+
94
+void printVec(SwsVector *a);
95
+void freeVec(SwsVector *a);
90 96
 
... ...
@@ -1935,13 +1935,10 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int
1935 1935
 		const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
1936 1936
 		const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
1937 1937
 
1938
-		if(flags&SWS_FAST_BILINEAR)
1939
-		{
1940
-			//handle holes
1941
-			if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1942
-			if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1943
-		}
1944
-
1938
+		//handle holes (FAST_BILINEAR & weird filters)
1939
+		if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1940
+		if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1941
+//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
1945 1942
 		ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
1946 1943
 		ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
1947 1944
 
... ...
@@ -1953,6 +1950,7 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int
1953 1953
 			{
1954 1954
 				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
1955 1955
 				lumBufIndex++;
1956
+//				printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
1956 1957
 				ASSERT(lumBufIndex < 2*vLumBufSize)
1957 1958
 				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
1958 1959
 				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)