svn commit: r1798533 - in /tomcat/trunk: java/org/apache/catalina/valves/CrawlerSessionManagerValve.java test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java webapps/docs/changelog.xml webapps/docs/config/valve.xml

Previous Topic | Next Topic
 
Classic | List | Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

svn commit: r1798533 - in /tomcat/trunk: java/org/apache/catalina/valves/CrawlerSessionManagerValve.java test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java webapps/docs/changelog.xml webapps/docs/config/valve.xml

violetagg
Author: violetagg
Date: Mon Jun 12 21:04:53 2017
New Revision: 1798533

URL: http://svn.apache.org/viewvc?rev=1798533&view=rev
Log:
A new configuration property 'crawlerIps' is added to the 'o.a.catalina.valves.CrawlerSessionManagerValve'. Using this property one can specify a regular expression that will be used to identify crawlers based on their IP address. Based on a patch provided by Tetradeus via GitHub.

Added:
    tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java   (with props)
Modified:
    tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
    tomcat/trunk/webapps/docs/changelog.xml
    tomcat/trunk/webapps/docs/config/valve.xml

Modified: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
URL: http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java?rev=1798533&r1=1798532&r2=1798533&view=diff
==============================================================================
--- tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java (original)
+++ tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java Mon Jun 12 21:04:53 2017
@@ -49,6 +49,10 @@ public class CrawlerSessionManagerValve
 
     private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*";
     private Pattern uaPattern = null;
+
+    private String crawlerIps = null;
+    private Pattern ipPattern = null;
+
     private int sessionInactiveInterval = 60;
 
 
@@ -86,6 +90,31 @@ public class CrawlerSessionManagerValve
 
 
     /**
+     * Specify the regular expression (using {@link Pattern}) that will be used
+     * to identify crawlers based on their IP address. The default is no crawler
+     * IPs.
+     *
+     * @param crawlerIps The regular expression using {@link Pattern}
+     */
+    public void setCrawlerIps(String crawlerIps) {
+        this.crawlerIps = crawlerIps;
+        if (crawlerIps == null || crawlerIps.length() == 0) {
+            ipPattern = null;
+        } else {
+            ipPattern = Pattern.compile(crawlerIps);
+        }
+    }
+
+    /**
+     * @see #setCrawlerIps(String)
+     * @return The current regular expression being used to match IP addresses.
+     */
+    public String getCrawlerIps() {
+        return crawlerIps;
+    }
+
+
+    /**
      * Specify the session timeout (in seconds) for a crawler's session. This is
      * typically lower than that for a user session. The default is 60 seconds.
      *
@@ -122,11 +151,11 @@ public class CrawlerSessionManagerValve
 
         boolean isBot = false;
         String sessionId = null;
-        String clientIp = null;
+        String clientIp = request.getRemoteAddr();
 
         if (log.isDebugEnabled()) {
-            log.debug(request.hashCode() + ": ClientIp=" + request.getRemoteAddr()
-                    + ", RequestedSessionId=" + request.getRequestedSessionId());
+            log.debug(request.hashCode() + ": ClientIp=" + clientIp + ", RequestedSessionId="
+                    + request.getRequestedSessionId());
         }
 
         // If the incoming request has a valid session ID, no action is required
@@ -155,9 +184,16 @@ public class CrawlerSessionManagerValve
                 }
             }
 
+            if (ipPattern != null && ipPattern.matcher(clientIp).matches()) {
+                isBot = true;
+
+                if (log.isDebugEnabled()) {
+                    log.debug(request.hashCode() + ": Bot found. IP=" + clientIp);
+                }
+            }
+
             // If this is a bot, is the session ID known?
             if (isBot) {
-                clientIp = request.getRemoteAddr();
                 sessionId = clientIpSessionId.get(clientIp);
                 if (sessionId != null) {
                     request.setRequestedSessionId(sessionId);

Added: tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
URL: http://svn.apache.org/viewvc/tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java?rev=1798533&view=auto
==============================================================================
--- tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java (added)
+++ tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java Mon Jun 12 21:04:53 2017
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.catalina.valves;
+
+import java.util.Collections;
+
+import javax.servlet.http.HttpSession;
+
+import org.junit.Test;
+
+import org.apache.catalina.Valve;
+import org.apache.catalina.connector.Request;
+import org.apache.catalina.connector.Response;
+import org.easymock.EasyMock;
+import org.easymock.IExpectationSetters;
+
+public class TestCrawlerSessionManagerValve {
+
+    @Test
+    public void testCrawlerIpsPositive() throws Exception {
+        CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
+        valve.setCrawlerIps("216\\.58\\.206\\.174");
+        valve.setNext(EasyMock.createMock(Valve.class));
+        HttpSession session = createSessionExpectations(valve, true);
+        Request request = createRequestExpectations("216.58.206.174", session, true);
+
+        EasyMock.replay(request, session);
+
+        valve.invoke(request, EasyMock.createMock(Response.class));
+
+        EasyMock.verify(request, session);
+    }
+
+    @Test
+    public void testCrawlerIpsNegative() throws Exception {
+        CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
+        valve.setCrawlerIps("216\\.58\\.206\\.174");
+        valve.setNext(EasyMock.createMock(Valve.class));
+        HttpSession session = createSessionExpectations(valve, false);
+        Request request = createRequestExpectations("127.0.0.1", session, false);
+
+        EasyMock.replay(request, session);
+
+        valve.invoke(request, EasyMock.createMock(Response.class));
+
+        EasyMock.verify(request, session);
+    }
+
+    private HttpSession createSessionExpectations(CrawlerSessionManagerValve valve, boolean isBot) {
+        HttpSession session = EasyMock.createMock(HttpSession.class);
+        if (isBot) {
+            EasyMock.expect(session.getId()).andReturn("id").times(2);
+            session.setAttribute(valve.getClass().getName(), valve);
+            EasyMock.expectLastCall();
+            session.setMaxInactiveInterval(60);
+            EasyMock.expectLastCall();
+        }
+        return session;
+    }
+
+    private Request createRequestExpectations(String ip, HttpSession session, boolean isBot) {
+        Request request = EasyMock.createMock(Request.class);
+        EasyMock.expect(request.getRemoteAddr()).andReturn(ip);
+        IExpectationSetters<HttpSession> setter = EasyMock.expect(request.getSession(false))
+                .andReturn(null);
+        if (isBot) {
+            setter.andReturn(session);
+        }
+        EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.emptyEnumeration());
+        return request;
+    }
+}

Propchange: tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tomcat/trunk/webapps/docs/changelog.xml
URL: http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/changelog.xml?rev=1798533&r1=1798532&r2=1798533&view=diff
==============================================================================
--- tomcat/trunk/webapps/docs/changelog.xml (original)
+++ tomcat/trunk/webapps/docs/changelog.xml Mon Jun 12 21:04:53 2017
@@ -100,6 +100,13 @@
         <code>o.a.catalina.startup.Tomcat</code>. Patch provided by
         peterhansson_se. (violetagg)
       </fix>
+      <add>
+        A new configuration property <code>crawlerIps</code> is added to the
+        <code>o.a.catalina.valves.CrawlerSessionManagerValve</code>. Using this
+        property one can specify a regular expression that will be used to
+        identify crawlers based on their IP address. Based on a patch provided
+        by Tetradeus. (violetagg)
+      </add>
     </changelog>
   </subsection>
   <subsection name="Coyote">

Modified: tomcat/trunk/webapps/docs/config/valve.xml
URL: http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/config/valve.xml?rev=1798533&r1=1798532&r2=1798533&view=diff
==============================================================================
--- tomcat/trunk/webapps/docs/config/valve.xml (original)
+++ tomcat/trunk/webapps/docs/config/valve.xml Mon Jun 12 21:04:53 2017
@@ -1651,6 +1651,12 @@
         </p>
       </attribute>
 
+      <attribute name="crawlerIps" required="false">
+        <p>Regular expression (using <code>java.util.regex</code>) that client
+        IP is matched against to determine if a request is from a web crawler.
+        By default such regular expression is not set.</p>
+      </attribute>
+
       <attribute name="crawlerUserAgents" required="false">
         <p>Regular expression (using <code>java.util.regex</code>) that the user
         agent HTTP request header is matched against to determine if a request



---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]