#!/usr/bin/perl -w
# Eeva Ahonen, June 2003
#
#----------------------------------------------------------------------
# Line-oriented filter: reads the files named on the command line (or
# STDIN) and writes the tagged text to STDOUT.
#
# A text line is marked with <TITLE> if it is preceded by at least one
# empty line and followed by exactly one empty line followed by a text
# line.
# Everything inside <PUBLICATION_INFO> and <...REF> regions is copied
# through untouched, as is every line that contains a closing tag.
# The first text line is tagged <BOOK_TITLE_1>; if the next text line
# follows it with fewer than two empty lines in between, that one is
# tagged <BOOK_TITLE_2>.
#----------------------------------------------------------------------

use strict;
use warnings;

# A line matching this opens a region that is copied through verbatim.
# The tag name must not start with '/', so that a closing tag such as
# </REF> is not mistaken for an opening one.
my $SKIP_OPEN_RE = qr{<PUBLICATION_INFO>|<[^/<>]*REF>};

# Any closing tag ends the verbatim region (</PUBLICATION_INFO> included).
my $CLOSE_TAG_RE = qr{</\w*>};

# tag_titles($in, $out)
#
# Reads lines from the filehandle $in and prints the tagged text to the
# filehandle $out.  Works line by line; at most two lines (a candidate
# title and the empty line after it) are held back at any time, because
# whether a line is a title can only be decided from the two lines that
# follow it.
sub tag_titles {
    my ($in, $out) = @_;

    my $skip        = 0;    # 1 inside a region that is copied verbatim
    my $empty_lines = 0;    # empty lines seen since the last text line
    my $beginning   = 0;    # 0 = first text line yet to come
                            # 1 = first line gone, expecting secondary title
                            # 2 = secondary title gone, or there wasn't any
    my $title_check = 0;    # 0 = nothing held back
                            # 1 = candidate title held back
                            # 2 = candidate and the empty line after it held back
    my $ln_pos_title   = "";    # the candidate title line
    my $ln_after_title = "";    # the empty line following the candidate

    # Prints whatever is held back as ordinary text, in input order, and
    # clears the pending state.  Used whenever the candidate turns out
    # not to be a title, so that no input line is lost or reordered.
    my $flush_pending = sub {
        return if $title_check == 0;
        print {$out} $ln_pos_title;
        if ($title_check == 2) {
            print {$out} $ln_after_title;
            $empty_lines++;    # the held-back empty line counts, too
        }
        $title_check = 0;
        return;
    };

    LINE:
    while (my $line = <$in>) {

        # entering text to be skipped
        # NOTE(review): a region that opens and closes on the same line
        # still leaves $skip set until a later closing tag - confirm
        # whether such lines occur in the input.
        if ($line =~ $SKIP_OPEN_RE) {
            $flush_pending->();
            $skip = 1;
            print {$out} $line;
            next LINE;
        }

        # exiting text to be skipped
        if ($line =~ $CLOSE_TAG_RE) {
            $flush_pending->();
            $skip = 0;
            print {$out} $line;
            next LINE;
        }

        # text to be skipped
        if ($skip) {
            print {$out} $line;
            next LINE;
        }

        if ($line =~ /^\s*$/) {    # empty line
            if ($title_check == 1) {
                # first empty line after a candidate: hold it back, too
                $ln_after_title = $line;
                $title_check    = 2;
            }
            else {
                # a second empty line after a candidate means it was no
                # title; release it together with its first empty line
                $flush_pending->();
                $empty_lines++;
                print {$out} $line;
            }
        }
        #---------------------------------------------------------
        elsif ($beginning < 2) {    # beginning of text, special case
            if ($beginning == 0) {
                print {$out} "<BOOK_TITLE_1>\n" . $line . "</BOOK_TITLE_1>\n";
                $beginning = 1;
            }
            elsif ($empty_lines < 2) {
                print {$out} "<BOOK_TITLE_2>\n" . $line . "</BOOK_TITLE_2>\n";
                $beginning = 2;
            }
            else {    # no secondary title
                print {$out} $line;
                $beginning = 2;
            }
            # Reset in every case, so that the text line after this one
            # is not treated as if it were preceded by empty lines.
            $empty_lines = 0;
        }
        #---------------------------------------------------------
        # normal = not beginning of text
        elsif ($empty_lines > 0 && $title_check == 0) {
            # possible title: remember it, decide on the following lines
            $ln_pos_title = $line;
            $title_check  = 1;
            $empty_lines  = 0;
        }
        elsif ($title_check == 2) {
            # candidate + one empty line + this text line: it is a title
            # NOTE(review): unlike the BOOK_TITLE tags, no closing tag is
            # written here, only an extra newline - confirm this is what
            # the consumers of the output expect.
            print {$out} "<TITLE>\n" . $ln_pos_title . "\n";
            print {$out} $ln_after_title;
            print {$out} $line;
            $title_check = 0;
            $empty_lines = 0;
        }
        else {    # normal text
            # a candidate directly followed by text was no title
            $flush_pending->();
            print {$out} $line;
            $empty_lines = 0;
        }
    }

    # End of input: release anything that is still held back.
    $flush_pending->();
    return;
}

# Run as a filter when executed as a script; do nothing when the file is
# loaded with require/do, so that tag_titles() can be called directly.
tag_titles(\*ARGV, \*STDOUT) unless caller;