#!/usr/bin/env perl
#
# Copyright (c) 2004, 2008 The NetBSD Foundation, Inc.
# All rights reserved.
#
# This code is derived from software contributed to The NetBSD Foundation
# by Hubert Feyrer <hubert@feyrer.de>.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 
# THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

#
# Extract BSD-mandated copyright messages for NetBSD documentation
#
# Usage:
# 1) find src xsrc -type f -print \
#    | perl extract-contrib-string.pl \
#    >x
#
# 2) merge text after "--------" in "x" into
#    src/distrib/notes/common/legal.common
#
# Options:
#
#     perl extract-contrib-string.pl [-d] [-h] [-x] [-?]
#
# where
#     -d  debug output
#     -h  html output
#     -x  xml/docbook output
#     -?  display help/usage message


# Pattern sources used by the scanner below.  They are deliberately kept
# as plain strings (not qr// objects): the matches that interpolate them
# supply their own /i and /n modifiers.

# First line of an advertising clause.
$ack_line1 = '(' . join('|',
    '[aA]ll( commercial)?( marketing or)? advertising materials mentioning( features)?',
    '\d\. Redistributions of any form whatsoever',
) . ')';

# Line that is expected right after a line matching $ack_line1.
$ack_line2 = '(' . join('|',
    'display the( following)?( acknowledge?ment)?',
    'acknowledge?ment:$',
) . ')';

# A line matching this ends the acknowledgement text being collected.
# The first group lists wordings that follow a clause number ("4. ...");
# the second group lists markers that are matched without one.
my @numbered_clause_starts = (
    'Neither the name',
    'The name of the company nor the name',	# Wasn't my idea
    'The name of the author may not',
    'The name of .* must not be used to endorse',
    'The names? (of )?.* nor the names? of',
    'The names? (of )?.* or any of it\'?s members',
    'Redistributions of any form whatsoever',
    'The names .*"OpenSSL Toolkit.*" and .*"OpenSSL Project.*" must not be used',
    "Urbana-Champaign Independent Media Center's name",
);
my @unnumbered_end_markers = (
    '^Neither the name',
    'THIS SOFTWARE IS PROVIDED',
    'ALL WARRANTIES WITH REGARD',
    'The word \'cryptographic\' can be left out if',
    'may be used to endorse',
    '@end cartouche',
    '</para>',
    'Redistribution and use in source and binary forms',
    'may not be used to endorse',
    '\.IP 4',
    'ALLOW FREE USE OF',
    'materials provided with the distribution',
    '@InsertRedistribution@',
);
$ack_endline =
      '(\d\.\s*(' . join('|', @numbered_clause_starts) . '))'
    . join('', map { "|($_)" } @unnumbered_end_markers);

# File names (matched unanchored against each path read from stdin)
# whose clause 3 wording needs the special handling in the scanner.
$known_bad_clause_3_wording = join('|',
    'usr.bin/lex/.*',				# UCB
    'dist/bind/contrib/nslint-2.1a3/lbl/.*',
    'usr.sbin/traceroute/ifaddrlist.h',
    'usr.sbin/traceroute/traceroute.c',
    'usr.sbin/hilinfo/hilinfo.c',		# CSS @ Utah
);

# warning($fn, $msg)
#
# Print a diagnostic line of the form "XXX <file> line <n>: <message>"
# to standard output.  <n> is Perl's current input line number ($.),
# i.e. the line number of the file handle that was read most recently.
#
#   $fn   name of the file the diagnostic refers to
#   $msg  text of the diagnostic
sub warning {
    # Lexical copies: the rest of the script keeps its own $fn and $msg
    # in package variables, which must not be overridden from here.
    my ($fn, $msg) = @_;
    print "XXX $fn line $.: $msg\n";
}

# Option handling.  Every command-line argument is consumed here, so the
# file list is afterwards read from standard input.  Matching is
# unanchored, and case-insensitive for -d, -h and -x.
foreach my $opt (splice(@ARGV)) {
    $debug = 1 if $opt =~ /-d/i;
    $html  = 1 if $opt =~ /-h/i;
    $xml   = 1 if $opt =~ /-x/i;
    $usage = 1 if $opt =~ /-\?/;
}

# -? only prints the help text and stops.
if ($usage) {
    print <<'USAGE';
usage: find /usr/src -type f -print |
 perl extract-contrib-string.pl [-h] [-x] [-?] [-d]
   where
    -h   output html
    -x   output xml/docbook
    -d   debug
    -?   display this help message
USAGE
    exit(0);
}

# Unless HTML or XML output was requested, every extracted message is
# also printed as soon as it is found, preceded by a roff comment that
# names the file it came from.
$comments = !$html && !$xml;

# Main scan.  Each line on standard input names one file.  Every file is
# searched for an advertising clause (a line matching $ack_line1 followed
# by a line matching $ack_line2); the text between that clause and the
# next line matching $ack_endline is cleaned of comment markup, split
# into individual "This software/product ..." messages, and stored in
# %copyrights under a normalized sort key.
#
# With -d, each line is traced with a prefix: "0>" line read, "S>" line
# skipped inside a quoted license, "1>" clause start, "2>" second clause
# line, "C>" collected message line, "E>" end line, "?>" unexpected line.
file:
while(<>) {
    chomp();
    $fn=$_;

    # Three-argument open with a lexical handle: the name is taken
    # literally, so whitespace or characters such as "|", "<" and ">" in
    # a file name cannot be mistaken for an open mode or a command.
    open(my $src, '<', $fn) || die "cannot read $fn: $!\n";

  line:
    while(<$src>) {
        print "0> $_" if $debug;

        # special case perl script generating a license (openssl's
        # mkerr.pl) - ignore the quoted license, there is another one
        # inside:
        if (/^\"\s\*.*$ack_line1.*\\n\"\,/n) {
            while(!/$ack_endline/in) {
                print "S> $_" if $debug;
                $_ = <$src>;
                # At end of file the read yields undef, which can never
                # match the end pattern: give up on this file instead of
                # looping forever.
                last line unless defined($_);
            }
        }

        if (/$ack_line1/in
            or (/$ack_line2/n and $fn =~ m,$known_bad_clause_3_wording,)) {

            print "1> $_" if $debug;

            # Normally the second clause line is on the next input line;
            # for the known-bad files the current line is kept.
            $_=<$src>
                unless $fn =~ m,$known_bad_clause_3_wording,;
            if (/$ack_line2/in or $fn =~ m,$known_bad_clause_3_wording,){

                print "2> $_" if $debug;

                $msg="";

                if ($fn =~ m,$known_bad_clause_3_wording, and /``/) {
                    $msg = $_;
                }
                elsif (/:\s+This product/) {
                    # src/sys/lib/libkern/rngtest.c - bad clause 3 wording
                    # that is not like others, so special case it here
                    $msg = $_;
                    $msg =~ s/^.*:\s+(This product.*)$/$1/;
                }

                # Collect lines up to the end marker.  A message that
                # runs past the line cap is reported and the rest of the
                # file is skipped; the cap also ends the loop when the
                # file ends first (reads then yield undef).
                $cnt=0;
                $_=<$src>;
                while(!/$ack_endline/in) {

                    print "C> $_" if $debug;

                    $msg .= $_;
                    $cnt++;
                    $_ = <$src>;
                    if ($cnt > 10) {
                        warning($fn,"loooong copyright?");
                        last line;
                    }
                }

                print "E> $_" if $debug;

                # post-process

                if ($fn =~ m,$known_bad_clause_3_wording,) {
                    # Drop leading lines until the first line holds the
                    # opening ``.
                    while ($msg !~ /^.*``.*\n/) {
                        last if (!$msg);
                        # No complete line left to drop: stop rather
                        # than retry the same substitution forever.
                        last unless $msg =~ s/^.*\n//;
                    }
                    $msg =~ s/^.*``//;
                    $msg =~ s/\n.*``//;
                    $msg =~ s/''.*$//;
                }

                # XXX: pcap &c - add to known_bad_clause_3_wording but
                # that code seems to have problems.  Easier to add a
                # hack here, shouldn't affect good clause 3.
                $msg =~ s/''\s+Neither the name.*$//;

                # *roff
                1 while $msg =~ s/^\.\\"\s*//;
                1 while $msg =~ s/\n\.\\"\s*/\n/;
                $msg =~ s/\n\.\\"\s*$/\n/g;

                # C++/C99
                1 while $msg =~ s/^\s*\/\/\s*//;
                1 while $msg =~ s/\n\s*\/\/\s*$//;
                $msg =~ s/\n\s*\/\/\s*/\n/g;

                # C
                1 while $msg =~ s/^\s*\*\s*//;
                1 while $msg =~ s/\n\s*\*\s*$//;
                $msg =~ s/\n\s*\*\s*/\n/g;

                # texinfo @c
                1 while $msg =~ s/^\s*\@c\s+//;
                1 while $msg =~ s/\n\s*\@c\s+$//;
                $msg =~ s/\n\s*\@c\s+/\n/g;

                $msg =~ s/^REM\s*//g;			# BASIC?!?
                $msg =~ s/\nREM\s*/\n/g;		# BASIC?!?
                $msg =~ s/^dnl\s*//g;			# m4
                $msg =~ s/\ndnl\s*/\n/g;		# m4
                $msg =~ s/^\s+-\s+//g;			# seen in docbook files
                $msg =~ s/\n\s+-\s+/ /g;		#
                $msg =~ s/^[#\\\|";]+\s*//g;		# sh etc.
                $msg =~ s/\n[#\\\|";]+\s*/\n/g;		# sh etc.
                $msg =~ s/^[ \t*]*//g;			# C
                $msg =~ s/\n[ \t*]*/\n/g;		# C

                $msg =~ s/\@cartouche\n//;		# texinfo

                # Carriage returns, written as an escape so the pattern
                # does not depend on a raw control character in this file.
                $msg =~ s/\r//g;
                $msg =~ s/\s*\n/\n/g;
                $msg =~ s/^\s*//;
                $msg =~ s/\\\@/\@/g;
                $msg =~ s/\n\n/\n/g;
                $msg =~ s/^\s*``//;
                $msg =~ s/''\s*$//;
                $msg =~ s/^\"//;
                $msg =~ s/\"$//;
                $msg =~ s/\"\.$/./;

                # Fix ISO-646-SE spelling of Lule\(oa
                $msg =~ s/Lule\}/Lule\\(oa/g;

                # Collapse multiple spaces between words.  There are a
                # few entries with "by__Name" that affects sorting.
                $msg =~ s/(\w)  +(\w)/$1 $2/g;

                # Split up into separate paragraphs
                #
                $msgs=$msg;
                $msgs=~s/(This (software|product))/|$1/g;
                $msgs=~s,^\|,,;
              msg:
                foreach $msg (split(/\|/, $msgs)) {
                    1 while $msg =~ s/[\n\s]+$//;
                    next if ($msg eq "");
                    if ($comments) {
                        print ".\\\" File $fn:\n";
                        print "$msg";
                        print "\n\n";
                    }

                    my $key = lc($msg);	# ignore difference in case
                    $key =~ s/\n/ /g;	# ignore difference in line breaks
                    $key =~ s/\.$//g;	# drop the final dot

                    # push organizations ("by the") to the end of the
                    # sorting order
                    $key =~ s/(developed by) the/$1 ~the/;

                    # Keep the first spelling seen for a key, except
                    # that a variant ending in a dot replaces one
                    # without.
                    if (defined $copyrights{$key}) {
                        if ($copyrights{$key} !~ /\.$/ && $msg =~ /\.$/) {
                            print "already there, without dot - overriding!\n"
                                if 1 || $debug;
                        }
                        else {
                            next msg;
                        }
                    }

                    $copyrights{$key} = $msg;
                }

            } else {
                print "?> $_" if $debug;

                if ($fn !~ m,$known_bad_clause_3_wording,) {
                    warning($fn, "bad clause 3?");
                }
                last line;
            }
        }
    }
    close($src);
}


# escape_markup($text)
#
# Return a copy of $text that is safe to place inside HTML or XML
# element content: "<" and ">" become &lt; and &gt;, and "&" becomes
# &amp; unless it already starts a character or entity reference (so
# text that is already escaped is not escaped a second time).
sub escape_markup {
    my ($text) = @_;
    $text =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Final report: all collected messages, ordered by their sort key.
#   -h       an HTML <ul> list, one <li> per message
#   -x       one <listitem> per message
#   default  a separator line, then the messages divided by ".It" lines
my @sorted_keys = sort keys %copyrights;

if ($html) {
    print "<ul>\n";
    print "<li>", escape_markup($copyrights{$_}), "</li>\n"
        foreach @sorted_keys;
    print "</ul>\n";
}
elsif ($xml) {
    print "<listitem>", escape_markup($copyrights{$_}), "</listitem>\n"
        foreach @sorted_keys;
}
else {
    print "------------------------------------------------------------\n";

    # ".It" goes between entries only, never before the first one.
    print join(".It\n", map { "$copyrights{$_}\n" } @sorted_keys);
}