| 
				
			 | 
			
			
				@@ -18,7 +18,17 @@ def result_content_len(content): 
			 | 
		
	
		
			
			| 
				18
			 | 
			
				18
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				19
			 | 
			
				19
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				20
			 | 
			
				20
			 | 
			
			
				 def compare_urls(url_a, url_b): 
			 | 
		
	
		
			
			| 
				21
			 | 
			
				
			 | 
			
			
				-    if url_a.netloc != url_b.netloc or url_a.query != url_b.query: 
			 | 
		
	
		
			
			| 
				
			 | 
			
				21
			 | 
			
			
				+    # ignore www. in comparison 
			 | 
		
	
		
			
			| 
				
			 | 
			
				22
			 | 
			
			
				+    if url_a.netloc.startswith('www.'): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				23
			 | 
			
			
				+        host_a = url_a.netloc.replace('www.', '', 1) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				24
			 | 
			
			
				+    else: 
			 | 
		
	
		
			
			| 
				
			 | 
			
				25
			 | 
			
			
				+        host_a = url_a.netloc 
			 | 
		
	
		
			
			| 
				
			 | 
			
				26
			 | 
			
			
				+    if url_b.netloc.startswith('www.'): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				27
			 | 
			
			
				+        host_b = url_b.netloc.replace('www.', '', 1) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				28
			 | 
			
			
				+    else: 
			 | 
		
	
		
			
			| 
				
			 | 
			
				29
			 | 
			
			
				+        host_b = url_b.netloc 
			 | 
		
	
		
			
			| 
				
			 | 
			
				30
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				31
			 | 
			
			
				+    if host_a != host_b or url_a.query != url_b.query: 
			 | 
		
	
		
			
			| 
				22
			 | 
			
				32
			 | 
			
			
				         return False 
			 | 
		
	
		
			
			| 
				23
			 | 
			
				33
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				24
			 | 
			
				34
			 | 
			
			
				     # remove / from the end of the url if required 
			 | 
		
	
	
		
			
			| 
				
			 | 
			
			
				@@ -37,15 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2): 
			 | 
		
	
		
			
			| 
				37
			 | 
			
				47
			 | 
			
			
				         urls1 = infobox1.get('urls', None) 
			 | 
		
	
		
			
			| 
				38
			 | 
			
				48
			 | 
			
			
				         if urls1 is None: 
			 | 
		
	
		
			
			| 
				39
			 | 
			
				49
			 | 
			
			
				             urls1 = [] 
			 | 
		
	
		
			
			| 
				40
			 | 
			
				
			 | 
			
			
				-            infobox1['urls'] = urls1 
			 | 
		
	
		
			
			| 
				41
			 | 
			
				50
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				42
			 | 
			
				
			 | 
			
			
				-        urlSet = set() 
			 | 
		
	
		
			
			| 
				43
			 | 
			
				
			 | 
			
			
				-        for url in infobox1.get('urls', []): 
			 | 
		
	
		
			
			| 
				44
			 | 
			
				
			 | 
			
			
				-            urlSet.add(url.get('url', None)) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				51
			 | 
			
			
				+        for url2 in infobox2.get('urls', []): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				52
			 | 
			
			
				+            unique_url = True 
			 | 
		
	
		
			
			| 
				
			 | 
			
				53
			 | 
			
			
				+            for url1 in infobox1.get('urls', []): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				54
			 | 
			
			
				+                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				55
			 | 
			
			
				+                    unique_url = False 
			 | 
		
	
		
			
			| 
				
			 | 
			
				56
			 | 
			
			
				+                    break 
			 | 
		
	
		
			
			| 
				
			 | 
			
				57
			 | 
			
			
				+            if unique_url: 
			 | 
		
	
		
			
			| 
				
			 | 
			
				58
			 | 
			
			
				+                urls1.append(url2) 
			 | 
		
	
		
			
			| 
				45
			 | 
			
				59
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				46
			 | 
			
				
			 | 
			
			
				-        for url in infobox2.get('urls', []): 
			 | 
		
	
		
			
			| 
				47
			 | 
			
				
			 | 
			
			
				-            if url.get('url', None) not in urlSet: 
			 | 
		
	
		
			
			| 
				48
			 | 
			
				
			 | 
			
			
				-                urls1.append(url) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				60
			 | 
			
			
				+        infobox1['urls'] = urls1 
			 | 
		
	
		
			
			| 
				49
			 | 
			
				61
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				50
			 | 
			
				62
			 | 
			
			
				     if 'img_src' in infobox2: 
			 | 
		
	
		
			
			| 
				51
			 | 
			
				63
			 | 
			
			
				         img1 = infobox1.get('img_src', None) 
			 | 
		
	
	
		
			
			| 
				
			 | 
			
			
				@@ -97,7 +109,6 @@ class ResultContainer(object): 
			 | 
		
	
		
			
			| 
				97
			 | 
			
				109
			 | 
			
			
				         self.results = defaultdict(list) 
			 | 
		
	
		
			
			| 
				98
			 | 
			
				110
			 | 
			
			
				         self._merged_results = [] 
			 | 
		
	
		
			
			| 
				99
			 | 
			
				111
			 | 
			
			
				         self.infoboxes = [] 
			 | 
		
	
		
			
			| 
				100
			 | 
			
				
			 | 
			
			
				-        self._infobox_ids = {} 
			 | 
		
	
		
			
			| 
				101
			 | 
			
				112
			 | 
			
			
				         self.suggestions = set() 
			 | 
		
	
		
			
			| 
				102
			 | 
			
				113
			 | 
			
			
				         self.answers = set() 
			 | 
		
	
		
			
			| 
				103
			 | 
			
				114
			 | 
			
			
				         self._number_of_results = [] 
			 | 
		
	
	
		
			
			| 
				
			 | 
			
			
				@@ -138,14 +149,13 @@ class ResultContainer(object): 
			 | 
		
	
		
			
			| 
				138
			 | 
			
				149
			 | 
			
			
				         add_infobox = True 
			 | 
		
	
		
			
			| 
				139
			 | 
			
				150
			 | 
			
			
				         infobox_id = infobox.get('id', None) 
			 | 
		
	
		
			
			| 
				140
			 | 
			
				151
			 | 
			
			
				         if infobox_id is not None: 
			 | 
		
	
		
			
			| 
				141
			 | 
			
				
			 | 
			
			
				-            existingIndex = self._infobox_ids.get(infobox_id, None) 
			 | 
		
	
		
			
			| 
				142
			 | 
			
				
			 | 
			
			
				-            if existingIndex is not None: 
			 | 
		
	
		
			
			| 
				143
			 | 
			
				
			 | 
			
			
				-                merge_two_infoboxes(self.infoboxes[existingIndex], infobox) 
			 | 
		
	
		
			
			| 
				144
			 | 
			
				
			 | 
			
			
				-                add_infobox = False 
			 | 
		
	
		
			
			| 
				
			 | 
			
				152
			 | 
			
			
				+            for existingIndex in self.infoboxes: 
			 | 
		
	
		
			
			| 
				
			 | 
			
				153
			 | 
			
			
				+                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				154
			 | 
			
			
				+                    merge_two_infoboxes(existingIndex, infobox) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				155
			 | 
			
			
				+                    add_infobox = False 
			 | 
		
	
		
			
			| 
				145
			 | 
			
				156
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				146
			 | 
			
				157
			 | 
			
			
				         if add_infobox: 
			 | 
		
	
		
			
			| 
				147
			 | 
			
				158
			 | 
			
			
				             self.infoboxes.append(infobox) 
			 | 
		
	
		
			
			| 
				148
			 | 
			
				
			 | 
			
			
				-            self._infobox_ids[infobox_id] = len(self.infoboxes) - 1 
			 | 
		
	
		
			
			| 
				149
			 | 
			
				159
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				150
			 | 
			
				160
			 | 
			
			
				     def _merge_result(self, result, position): 
			 | 
		
	
		
			
			| 
				151
			 | 
			
				161
			 | 
			
			
				         result['parsed_url'] = urlparse(result['url']) 
			 | 
		
	
	
		
			
			| 
				
			 | 
			
			
				@@ -155,11 +165,6 @@ class ResultContainer(object): 
			 | 
		
	
		
			
			| 
				155
			 | 
			
				165
			 | 
			
			
				             result['parsed_url'] = result['parsed_url']._replace(scheme="http") 
			 | 
		
	
		
			
			| 
				156
			 | 
			
				166
			 | 
			
			
				             result['url'] = result['parsed_url'].geturl() 
			 | 
		
	
		
			
			| 
				157
			 | 
			
				167
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				158
			 | 
			
				
			 | 
			
			
				-        result['host'] = result['parsed_url'].netloc 
			 | 
		
	
		
			
			| 
				159
			 | 
			
				
			 | 
			
			
				- 
			 | 
		
	
		
			
			| 
				160
			 | 
			
				
			 | 
			
			
				-        if result['host'].startswith('www.'): 
			 | 
		
	
		
			
			| 
				161
			 | 
			
				
			 | 
			
			
				-            result['host'] = result['host'].replace('www.', '', 1) 
			 | 
		
	
		
			
			| 
				162
			 | 
			
				
			 | 
			
			
				- 
			 | 
		
	
		
			
			| 
				163
			 | 
			
				168
			 | 
			
			
				         result['engines'] = [result['engine']] 
			 | 
		
	
		
			
			| 
				164
			 | 
			
				169
			 | 
			
			
				  
			 | 
		
	
		
			
			| 
				165
			 | 
			
				170
			 | 
			
			
				         # strip multiple spaces and carriage returns from content 
			 |